1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX9
3 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,VI
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX10
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX11
7 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Subtracts two <2 x i16> values that are loaded (volatile) from %in0 and %in1
; at a workitem-id-indexed address, then stores the result.
; Expected codegen: GFX9/GFX10/GFX11 use a single v_pk_sub_i16; VI splits the
; operation into v_sub_u16 for the low halves plus an SDWA v_sub_u16 for the
; high halves and ORs the two results together.
; NOTE(review): %gep.out is computed but never used; the store goes to %out
; directly. Presumably this is what the FIXME above refers to -- confirm.
8 define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
9 ; GFX9-LABEL: v_test_sub_v2i16:
11 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
12 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
13 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
14 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
15 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
16 ; GFX9-NEXT: s_waitcnt vmcnt(0)
17 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
18 ; GFX9-NEXT: s_waitcnt vmcnt(0)
19 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
20 ; GFX9-NEXT: s_mov_b32 s6, -1
21 ; GFX9-NEXT: v_pk_sub_i16 v0, v1, v2
22 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
25 ; VI-LABEL: v_test_sub_v2i16:
27 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
28 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
29 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
30 ; VI-NEXT: s_waitcnt lgkmcnt(0)
31 ; VI-NEXT: v_mov_b32_e32 v1, s7
32 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
33 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
34 ; VI-NEXT: v_mov_b32_e32 v3, s1
35 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
36 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
37 ; VI-NEXT: flat_load_dword v0, v[0:1] glc
38 ; VI-NEXT: s_waitcnt vmcnt(0)
39 ; VI-NEXT: flat_load_dword v1, v[2:3] glc
40 ; VI-NEXT: s_waitcnt vmcnt(0)
41 ; VI-NEXT: s_mov_b32 s7, 0xf000
42 ; VI-NEXT: s_mov_b32 s6, -1
43 ; VI-NEXT: v_sub_u16_e32 v2, v0, v1
44 ; VI-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
45 ; VI-NEXT: v_or_b32_e32 v0, v2, v0
46 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
49 ; GFX10-LABEL: v_test_sub_v2i16:
51 ; GFX10-NEXT: s_clause 0x1
52 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
53 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
54 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
55 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
56 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
57 ; GFX10-NEXT: s_waitcnt vmcnt(0)
58 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
59 ; GFX10-NEXT: s_waitcnt vmcnt(0)
60 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
61 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000
62 ; GFX10-NEXT: s_mov_b32 s6, -1
63 ; GFX10-NEXT: v_pk_sub_i16 v0, v1, v2
64 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
65 ; GFX10-NEXT: s_endpgm
67 ; GFX11-LABEL: v_test_sub_v2i16:
69 ; GFX11-NEXT: s_clause 0x1
70 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
71 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
72 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
73 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
74 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
75 ; GFX11-NEXT: s_waitcnt vmcnt(0)
76 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
77 ; GFX11-NEXT: s_waitcnt vmcnt(0)
78 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
79 ; GFX11-NEXT: s_mov_b32 s6, -1
80 ; GFX11-NEXT: v_pk_sub_i16 v0, v1, v0
81 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
83 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
84 ; GFX11-NEXT: s_endpgm
85 %tid = call i32 @llvm.amdgcn.workitem.id.x()
86 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
87 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
88 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
89 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
90 %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
91 %add = sub <2 x i16> %a, %b
92 store <2 x i16> %add, ptr addrspace(1) %out
; Scalar variant: both <2 x i16> operands are loaded directly through the
; addrspace(4) pointer kernel arguments (no workitem-id indexing), so the
; checks expect scalar loads (s_load_dword / s_load_b32) for the operands.
; Expected codegen: GFX9/GFX10/GFX11 use v_pk_sub_i16 (GFX9 first moves one
; operand into a VGPR; GFX10/GFX11 take both operands from SGPRs). VI expands
; the subtract on the scalar unit: s_lshr_b32 to extract the high halves, two
; s_sub_i32, then s_and_b32/s_lshl_b32/s_or_b32 to repack the result.
96 define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0, ptr addrspace(4) %in1) #1 {
97 ; GFX9-LABEL: s_test_sub_v2i16:
99 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
100 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
101 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
102 ; GFX9-NEXT: s_mov_b32 s2, -1
103 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
104 ; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0
105 ; GFX9-NEXT: s_load_dword s11, s[6:7], 0x0
106 ; GFX9-NEXT: s_mov_b32 s0, s4
107 ; GFX9-NEXT: s_mov_b32 s1, s5
108 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
109 ; GFX9-NEXT: v_mov_b32_e32 v0, s10
110 ; GFX9-NEXT: v_pk_sub_i16 v0, s11, v0
111 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
112 ; GFX9-NEXT: s_endpgm
114 ; VI-LABEL: s_test_sub_v2i16:
116 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
117 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
118 ; VI-NEXT: s_mov_b32 s3, 0xf000
119 ; VI-NEXT: s_mov_b32 s2, -1
120 ; VI-NEXT: s_waitcnt lgkmcnt(0)
121 ; VI-NEXT: s_load_dword s6, s[6:7], 0x0
122 ; VI-NEXT: s_load_dword s7, s[0:1], 0x0
123 ; VI-NEXT: s_mov_b32 s0, s4
124 ; VI-NEXT: s_mov_b32 s1, s5
125 ; VI-NEXT: s_waitcnt lgkmcnt(0)
126 ; VI-NEXT: s_lshr_b32 s4, s6, 16
127 ; VI-NEXT: s_lshr_b32 s5, s7, 16
128 ; VI-NEXT: s_sub_i32 s6, s6, s7
129 ; VI-NEXT: s_sub_i32 s4, s4, s5
130 ; VI-NEXT: s_and_b32 s5, s6, 0xffff
131 ; VI-NEXT: s_lshl_b32 s4, s4, 16
132 ; VI-NEXT: s_or_b32 s4, s5, s4
133 ; VI-NEXT: v_mov_b32_e32 v0, s4
134 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
137 ; GFX10-LABEL: s_test_sub_v2i16:
139 ; GFX10-NEXT: s_clause 0x1
140 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
141 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
142 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
143 ; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
144 ; GFX10-NEXT: s_load_dword s1, s[2:3], 0x0
145 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000
146 ; GFX10-NEXT: s_mov_b32 s6, -1
147 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
148 ; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1
149 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
150 ; GFX10-NEXT: s_endpgm
152 ; GFX11-LABEL: s_test_sub_v2i16:
154 ; GFX11-NEXT: s_clause 0x1
155 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
156 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
157 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
158 ; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x0
159 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
160 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
161 ; GFX11-NEXT: s_mov_b32 s6, -1
162 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
163 ; GFX11-NEXT: v_pk_sub_i16 v0, s2, s0
164 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
165 ; GFX11-NEXT: s_nop 0
166 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
167 ; GFX11-NEXT: s_endpgm
168 %a = load <2 x i16>, ptr addrspace(4) %in0
169 %b = load <2 x i16>, ptr addrspace(4) %in1
170 %add = sub <2 x i16> %a, %b
171 store <2 x i16> %add, ptr addrspace(1) %out
; Subtracting a value from itself (%a - %a): the checks expect the result to
; be folded to the constant 0 on every target. The expected output contains
; only the kernarg load of %out, a v_mov_b32 of 0 and the store -- no subtract
; and no load through %in0.
; The GCN prefix is shared by the gfx900 and tonga RUN lines, so a single set
; of GCN checks covers both of those targets here.
175 define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0) #1 {
176 ; GCN-LABEL: s_test_sub_self_v2i16:
178 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
179 ; GCN-NEXT: s_mov_b32 s3, 0xf000
180 ; GCN-NEXT: s_mov_b32 s2, -1
181 ; GCN-NEXT: v_mov_b32_e32 v0, 0
182 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
183 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
186 ; GFX10-LABEL: s_test_sub_self_v2i16:
188 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
189 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
190 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
191 ; GFX10-NEXT: s_mov_b32 s2, -1
192 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
193 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
194 ; GFX10-NEXT: s_endpgm
196 ; GFX11-LABEL: s_test_sub_self_v2i16:
198 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
199 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
200 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
201 ; GFX11-NEXT: s_mov_b32 s2, -1
202 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
203 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
204 ; GFX11-NEXT: s_nop 0
205 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
206 ; GFX11-NEXT: s_endpgm
207 %a = load <2 x i16>, ptr addrspace(4) %in0
208 %add = sub <2 x i16> %a, %a
209 store <2 x i16> %add, ptr addrspace(1) %out
213 ; FIXME: VI should not scalarize arg access.
; Both <2 x i16> operands are passed by value as kernel arguments, so they
; arrive in SGPRs (s2 and s3 after the single s_load_dwordx4 / s_load_b128).
; Expected codegen: GFX9/GFX10/GFX11 use v_pk_sub_i16 (GFX9 first moves s3
; into a VGPR). VI currently splits the operands with s_lshr_b32, subtracts
; each half with s_sub_i32 and repacks with s_lshl_b32/s_and_b32/s_or_b32,
; which is the behaviour the FIXME above refers to.
214 define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 {
215 ; GFX9-LABEL: s_test_sub_v2i16_kernarg:
217 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
218 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
219 ; GFX9-NEXT: s_mov_b32 s6, -1
220 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
221 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
222 ; GFX9-NEXT: s_mov_b32 s4, s0
223 ; GFX9-NEXT: s_mov_b32 s5, s1
224 ; GFX9-NEXT: v_pk_sub_i16 v0, s2, v0
225 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
226 ; GFX9-NEXT: s_endpgm
228 ; VI-LABEL: s_test_sub_v2i16_kernarg:
230 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
231 ; VI-NEXT: s_mov_b32 s7, 0xf000
232 ; VI-NEXT: s_mov_b32 s6, -1
233 ; VI-NEXT: s_waitcnt lgkmcnt(0)
234 ; VI-NEXT: s_mov_b32 s4, s0
235 ; VI-NEXT: s_mov_b32 s5, s1
236 ; VI-NEXT: s_lshr_b32 s0, s2, 16
237 ; VI-NEXT: s_lshr_b32 s1, s3, 16
238 ; VI-NEXT: s_sub_i32 s0, s0, s1
239 ; VI-NEXT: s_sub_i32 s1, s2, s3
240 ; VI-NEXT: s_lshl_b32 s0, s0, 16
241 ; VI-NEXT: s_and_b32 s1, s1, 0xffff
242 ; VI-NEXT: s_or_b32 s0, s1, s0
243 ; VI-NEXT: v_mov_b32_e32 v0, s0
244 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
247 ; GFX10-LABEL: s_test_sub_v2i16_kernarg:
249 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
250 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000
251 ; GFX10-NEXT: s_mov_b32 s6, -1
252 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
253 ; GFX10-NEXT: v_pk_sub_i16 v0, s2, s3
254 ; GFX10-NEXT: s_mov_b32 s4, s0
255 ; GFX10-NEXT: s_mov_b32 s5, s1
256 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
257 ; GFX10-NEXT: s_endpgm
259 ; GFX11-LABEL: s_test_sub_v2i16_kernarg:
261 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
262 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
263 ; GFX11-NEXT: s_mov_b32 s6, -1
264 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
265 ; GFX11-NEXT: v_pk_sub_i16 v0, s2, s3
266 ; GFX11-NEXT: s_mov_b32 s4, s0
267 ; GFX11-NEXT: s_mov_b32 s5, s1
268 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
269 ; GFX11-NEXT: s_nop 0
270 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
271 ; GFX11-NEXT: s_endpgm
272 %add = sub <2 x i16> %a, %b
273 store <2 x i16> %add, ptr addrspace(1) %out
; Subtracts the non-inline constant <i16 123, i16 456> from a per-workitem
; volatile load. Packed into one dword the constant is 0x1c8007b
; (456 = 0x1c8 in the high half, 123 = 0x7b in the low half).
; Expected codegen: GFX9 materializes the literal in an SGPR (s_mov_b32 s4)
; and feeds it to v_pk_sub_i16; GFX10/GFX11 encode the literal directly in
; v_pk_sub_i16. VI rewrites each half as an add of the negated constant:
; 0xff85 is -123 as a 16-bit value and 0xfffffe38 is -456.
; NOTE(review): %gep.out is computed but never used; the store goes to %out.
277 define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
278 ; GFX9-LABEL: v_test_sub_v2i16_constant:
280 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
281 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
282 ; GFX9-NEXT: s_mov_b32 s4, 0x1c8007b
283 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
284 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
285 ; GFX9-NEXT: s_waitcnt vmcnt(0)
286 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
287 ; GFX9-NEXT: s_mov_b32 s2, -1
288 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4
289 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
290 ; GFX9-NEXT: s_endpgm
292 ; VI-LABEL: v_test_sub_v2i16_constant:
294 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
295 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
296 ; VI-NEXT: s_waitcnt lgkmcnt(0)
297 ; VI-NEXT: v_mov_b32_e32 v1, s3
298 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
299 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
300 ; VI-NEXT: flat_load_dword v0, v[0:1] glc
301 ; VI-NEXT: s_waitcnt vmcnt(0)
302 ; VI-NEXT: v_mov_b32_e32 v1, 0xfffffe38
303 ; VI-NEXT: s_mov_b32 s3, 0xf000
304 ; VI-NEXT: s_mov_b32 s2, -1
305 ; VI-NEXT: v_add_u16_e32 v2, 0xff85, v0
306 ; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
307 ; VI-NEXT: v_or_b32_e32 v0, v2, v0
308 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
311 ; GFX10-LABEL: v_test_sub_v2i16_constant:
313 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
314 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
315 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
316 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
317 ; GFX10-NEXT: s_waitcnt vmcnt(0)
318 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
319 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
320 ; GFX10-NEXT: s_mov_b32 s2, -1
321 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, 0x1c8007b
322 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
323 ; GFX10-NEXT: s_endpgm
325 ; GFX11-LABEL: v_test_sub_v2i16_constant:
327 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
328 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
329 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
330 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
331 ; GFX11-NEXT: s_waitcnt vmcnt(0)
332 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
333 ; GFX11-NEXT: s_mov_b32 s2, -1
334 ; GFX11-NEXT: v_pk_sub_i16 v0, v0, 0x1c8007b
335 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
336 ; GFX11-NEXT: s_nop 0
337 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
338 ; GFX11-NEXT: s_endpgm
339 %tid = call i32 @llvm.amdgcn.workitem.id.x()
340 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
341 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
342 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
343 %add = sub <2 x i16> %a, <i16 123, i16 456>
344 store <2 x i16> %add, ptr addrspace(1) %out
348 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Subtracts the negative non-inline constant <i16 -845, i16 -991> from a
; per-workitem volatile load. Packed into one dword the constant is
; 0xfc21fcb3 (-991 = 0xfc21 in the high half, -845 = 0xfcb3 in the low half).
; Expected codegen: GFX9 materializes the literal in an SGPR and uses
; v_pk_sub_i16; GFX10/GFX11 encode the literal directly in v_pk_sub_i16.
; VI rewrites each half as an add of the positive value: 0x34d = 845 for the
; low half and 0x3df = 991 for the high half.
; NOTE(review): %gep.out is computed but never used; the store goes to %out.
; Presumably this is what the FIXME above refers to -- confirm.
349 define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
350 ; GFX9-LABEL: v_test_sub_v2i16_neg_constant:
352 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
353 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
354 ; GFX9-NEXT: s_mov_b32 s4, 0xfc21fcb3
355 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
356 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
357 ; GFX9-NEXT: s_waitcnt vmcnt(0)
358 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
359 ; GFX9-NEXT: s_mov_b32 s2, -1
360 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4
361 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
362 ; GFX9-NEXT: s_endpgm
364 ; VI-LABEL: v_test_sub_v2i16_neg_constant:
366 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
367 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
368 ; VI-NEXT: s_waitcnt lgkmcnt(0)
369 ; VI-NEXT: v_mov_b32_e32 v1, s3
370 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
371 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
372 ; VI-NEXT: flat_load_dword v0, v[0:1] glc
373 ; VI-NEXT: s_waitcnt vmcnt(0)
374 ; VI-NEXT: v_mov_b32_e32 v1, 0x3df
375 ; VI-NEXT: s_mov_b32 s3, 0xf000
376 ; VI-NEXT: s_mov_b32 s2, -1
377 ; VI-NEXT: v_add_u16_e32 v2, 0x34d, v0
378 ; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
379 ; VI-NEXT: v_or_b32_e32 v0, v2, v0
380 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
383 ; GFX10-LABEL: v_test_sub_v2i16_neg_constant:
385 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
386 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
387 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
388 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
389 ; GFX10-NEXT: s_waitcnt vmcnt(0)
390 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
391 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
392 ; GFX10-NEXT: s_mov_b32 s2, -1
393 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, 0xfc21fcb3
394 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
395 ; GFX10-NEXT: s_endpgm
397 ; GFX11-LABEL: v_test_sub_v2i16_neg_constant:
399 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
400 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
401 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
402 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
403 ; GFX11-NEXT: s_waitcnt vmcnt(0)
404 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
405 ; GFX11-NEXT: s_mov_b32 s2, -1
406 ; GFX11-NEXT: v_pk_sub_i16 v0, v0, 0xfc21fcb3
407 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
408 ; GFX11-NEXT: s_nop 0
409 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
410 ; GFX11-NEXT: s_endpgm
411 %tid = call i32 @llvm.amdgcn.workitem.id.x()
412 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
413 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
414 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
415 %add = sub <2 x i16> %a, <i16 -845, i16 -991>
416 store <2 x i16> %add, ptr addrspace(1) %out
; Subtracts the splat constant <i16 -1, i16 -1> from a per-workitem volatile
; load.
; Expected codegen: GFX9/GFX10/GFX11 pass -1 directly as the second operand of
; v_pk_sub_i16 (no separate s_mov/literal as in the non-inline constant
; tests). VI rewrites each half as an add of 1 (v_add_u16 for the low half,
; SDWA v_add_u16 for the high half) and ORs the results.
; NOTE(review): %gep.out is computed but never used; the store goes to %out.
420 define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
421 ; GFX9-LABEL: v_test_sub_v2i16_inline_neg1:
423 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
424 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
425 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
426 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
427 ; GFX9-NEXT: s_waitcnt vmcnt(0)
428 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
429 ; GFX9-NEXT: s_mov_b32 s2, -1
430 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, -1
431 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
432 ; GFX9-NEXT: s_endpgm
434 ; VI-LABEL: v_test_sub_v2i16_inline_neg1:
436 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
437 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
438 ; VI-NEXT: s_waitcnt lgkmcnt(0)
439 ; VI-NEXT: v_mov_b32_e32 v1, s3
440 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
441 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
442 ; VI-NEXT: flat_load_dword v0, v[0:1] glc
443 ; VI-NEXT: s_waitcnt vmcnt(0)
444 ; VI-NEXT: v_mov_b32_e32 v1, 1
445 ; VI-NEXT: s_mov_b32 s3, 0xf000
446 ; VI-NEXT: s_mov_b32 s2, -1
447 ; VI-NEXT: v_add_u16_e32 v2, 1, v0
448 ; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
449 ; VI-NEXT: v_or_b32_e32 v0, v2, v0
450 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
453 ; GFX10-LABEL: v_test_sub_v2i16_inline_neg1:
455 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
456 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
457 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
458 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
459 ; GFX10-NEXT: s_waitcnt vmcnt(0)
460 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
461 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
462 ; GFX10-NEXT: s_mov_b32 s2, -1
463 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, -1
464 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
465 ; GFX10-NEXT: s_endpgm
467 ; GFX11-LABEL: v_test_sub_v2i16_inline_neg1:
469 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
470 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
471 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
472 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
473 ; GFX11-NEXT: s_waitcnt vmcnt(0)
474 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
475 ; GFX11-NEXT: s_mov_b32 s2, -1
476 ; GFX11-NEXT: v_pk_sub_i16 v0, v0, -1
477 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
478 ; GFX11-NEXT: s_nop 0
479 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
480 ; GFX11-NEXT: s_endpgm
481 %tid = call i32 @llvm.amdgcn.workitem.id.x()
482 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
483 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
484 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
485 %add = sub <2 x i16> %a, <i16 -1, i16 -1>
486 store <2 x i16> %add, ptr addrspace(1) %out
; Subtracts <i16 32, i16 0> from a per-workitem volatile load: only the low
; element has a non-zero subtrahend, the high element is zero.
; Expected codegen: GFX9/GFX10/GFX11 pass 32 directly as the second operand of
; v_pk_sub_i16. VI only touches the low half: it keeps the high half with
; v_and_b32 0xffff0000, subtracts 32 from the low half with v_subrev_u16 and
; ORs the two together.
; NOTE(review): %gep.out is computed but never used; the store goes to %out.
490 define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
491 ; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
493 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
494 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
495 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
496 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
497 ; GFX9-NEXT: s_waitcnt vmcnt(0)
498 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
499 ; GFX9-NEXT: s_mov_b32 s2, -1
500 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, 32
501 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
502 ; GFX9-NEXT: s_endpgm
504 ; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
506 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
507 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
508 ; VI-NEXT: s_waitcnt lgkmcnt(0)
509 ; VI-NEXT: v_mov_b32_e32 v1, s3
510 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
511 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
512 ; VI-NEXT: flat_load_dword v0, v[0:1] glc
513 ; VI-NEXT: s_waitcnt vmcnt(0)
514 ; VI-NEXT: s_mov_b32 s3, 0xf000
515 ; VI-NEXT: s_mov_b32 s2, -1
516 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
517 ; VI-NEXT: v_subrev_u16_e32 v0, 32, v0
518 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
519 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
522 ; GFX10-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
524 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
525 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
526 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
527 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
528 ; GFX10-NEXT: s_waitcnt vmcnt(0)
529 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
530 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
531 ; GFX10-NEXT: s_mov_b32 s2, -1
532 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, 32
533 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
534 ; GFX10-NEXT: s_endpgm
536 ; GFX11-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
538 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
539 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
540 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
541 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
542 ; GFX11-NEXT: s_waitcnt vmcnt(0)
543 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
544 ; GFX11-NEXT: s_mov_b32 s2, -1
545 ; GFX11-NEXT: v_pk_sub_i16 v0, v0, 32
546 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
547 ; GFX11-NEXT: s_nop 0
548 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
549 ; GFX11-NEXT: s_endpgm
550 %tid = call i32 @llvm.amdgcn.workitem.id.x()
551 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
552 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
553 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
554 %add = sub <2 x i16> %a, <i16 32, i16 0>
555 store <2 x i16> %add, ptr addrspace(1) %out
559 ; The high element (16256 = 0x3f80) combined with a zero low element packs
; to 0x3f800000, which is the bit pattern of the floating-point value 1.0.
; Expected codegen: GFX9/GFX10/GFX11 print the packed operand of v_pk_sub_i16
; as the fp inline constant 1.0. VI only modifies the high half: it adds
; 0xffffc080 (-16256) to it with an SDWA v_add_u16 and merges the untouched
; low half back in with v_or_b32_sdwa (src0_sel:WORD_0).
; NOTE(review): %gep.out is computed but never used; the store goes to %out.
560 define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
561 ; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split:
563 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
564 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
565 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
566 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
567 ; GFX9-NEXT: s_waitcnt vmcnt(0)
568 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
569 ; GFX9-NEXT: s_mov_b32 s2, -1
570 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, 1.0
571 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
572 ; GFX9-NEXT: s_endpgm
574 ; VI-LABEL: v_test_sub_v2i16_inline_fp_split:
576 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
577 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
578 ; VI-NEXT: s_waitcnt lgkmcnt(0)
579 ; VI-NEXT: v_mov_b32_e32 v1, s3
580 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
581 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
582 ; VI-NEXT: flat_load_dword v0, v[0:1] glc
583 ; VI-NEXT: s_waitcnt vmcnt(0)
584 ; VI-NEXT: v_mov_b32_e32 v1, 0xffffc080
585 ; VI-NEXT: s_mov_b32 s3, 0xf000
586 ; VI-NEXT: s_mov_b32 s2, -1
587 ; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
588 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
589 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
592 ; GFX10-LABEL: v_test_sub_v2i16_inline_fp_split:
594 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
595 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
596 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
597 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
598 ; GFX10-NEXT: s_waitcnt vmcnt(0)
599 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
600 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
601 ; GFX10-NEXT: s_mov_b32 s2, -1
602 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, 1.0
603 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
604 ; GFX10-NEXT: s_endpgm
606 ; GFX11-LABEL: v_test_sub_v2i16_inline_fp_split:
608 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
609 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
610 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
611 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
612 ; GFX11-NEXT: s_waitcnt vmcnt(0)
613 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
614 ; GFX11-NEXT: s_mov_b32 s2, -1
615 ; GFX11-NEXT: v_pk_sub_i16 v0, v0, 1.0
616 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
617 ; GFX11-NEXT: s_nop 0
618 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
619 ; GFX11-NEXT: s_endpgm
620 %tid = call i32 @llvm.amdgcn.workitem.id.x()
621 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
622 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
623 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
624 %add = sub <2 x i16> %a, <i16 0, i16 16256>
625 store <2 x i16> %add, ptr addrspace(1) %out
629 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Subtracts two per-workitem <2 x i16> values and zero-extends the result to
; <2 x i32> before storing it (a 64-bit store).
; Expected codegen: GFX9/GFX10/GFX11 do v_pk_sub_i16, then unpack with
; v_lshrrev_b32 by 16 (high element) and v_and_b32 0xffff (low element).
; VI produces each 32-bit element directly: v_sub_u16 for the low element and
; an SDWA v_sub_u16 with dst_sel:DWORD for the high element, with no separate
; unpack instructions.
; NOTE(review): %gep.out is computed but never used; the store goes to %out.
; Presumably this is what the FIXME above refers to -- confirm.
630 define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
631 ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32:
633 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
634 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
635 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
636 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
637 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
638 ; GFX9-NEXT: s_waitcnt vmcnt(0)
639 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
640 ; GFX9-NEXT: s_waitcnt vmcnt(0)
641 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
642 ; GFX9-NEXT: s_mov_b32 s6, -1
643 ; GFX9-NEXT: v_pk_sub_i16 v0, v1, v2
644 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
645 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
646 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
647 ; GFX9-NEXT: s_endpgm
649 ; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32:
651 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
652 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
653 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
654 ; VI-NEXT: s_waitcnt lgkmcnt(0)
655 ; VI-NEXT: v_mov_b32_e32 v1, s7
656 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
657 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
658 ; VI-NEXT: v_mov_b32_e32 v3, s1
659 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
660 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
661 ; VI-NEXT: flat_load_dword v1, v[0:1] glc
662 ; VI-NEXT: s_waitcnt vmcnt(0)
663 ; VI-NEXT: flat_load_dword v2, v[2:3] glc
664 ; VI-NEXT: s_waitcnt vmcnt(0)
665 ; VI-NEXT: s_mov_b32 s7, 0xf000
666 ; VI-NEXT: s_mov_b32 s6, -1
667 ; VI-NEXT: v_sub_u16_e32 v0, v1, v2
668 ; VI-NEXT: v_sub_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
669 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
672 ; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i32:
674 ; GFX10-NEXT: s_clause 0x1
675 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
676 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
677 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
678 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
679 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
680 ; GFX10-NEXT: s_waitcnt vmcnt(0)
681 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
682 ; GFX10-NEXT: s_waitcnt vmcnt(0)
683 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
684 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000
685 ; GFX10-NEXT: s_mov_b32 s6, -1
686 ; GFX10-NEXT: v_pk_sub_i16 v0, v1, v2
687 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
688 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
689 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
690 ; GFX10-NEXT: s_endpgm
692 ; GFX11-LABEL: v_test_sub_v2i16_zext_to_v2i32:
694 ; GFX11-NEXT: s_clause 0x1
695 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
696 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
697 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
698 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
699 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
700 ; GFX11-NEXT: s_waitcnt vmcnt(0)
701 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
702 ; GFX11-NEXT: s_waitcnt vmcnt(0)
703 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
704 ; GFX11-NEXT: s_mov_b32 s6, -1
705 ; GFX11-NEXT: v_pk_sub_i16 v0, v1, v0
706 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
707 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
708 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
709 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
710 ; GFX11-NEXT: s_nop 0
711 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
712 ; GFX11-NEXT: s_endpgm
713 %tid = call i32 @llvm.amdgcn.workitem.id.x()
714 %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid
715 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
716 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
717 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
718 %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
719 %add = sub <2 x i16> %a, %b
720 %ext = zext <2 x i16> %add to <2 x i32>
721 store <2 x i32> %ext, ptr addrspace(1) %out
725 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Subtracts two per-workitem <2 x i16> values and zero-extends the result to
; <2 x i64> before storing it (a 128-bit store).
; Expected codegen: GFX9/GFX10/GFX11 do v_pk_sub_i16, extract the high element
; with v_alignbit_b32 (shift by 16 with a zero high input), mask the low
; element with v_and_b32 0xffff, and fill the upper dword of each i64 with 0
; (v1 = 0, v3 = v1). VI produces the two low dwords with v_sub_u16 and an
; SDWA v_sub_u16 (dst_sel:DWORD) and likewise zeroes v1 and v3.
; NOTE(review): %gep.out is computed but never used; the store goes to %out.
; Presumably this is what the FIXME above refers to -- confirm.
726 define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
727 ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64:
729 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
730 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
731 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
732 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
733 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
734 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
735 ; GFX9-NEXT: s_waitcnt vmcnt(0)
736 ; GFX9-NEXT: global_load_dword v3, v0, s[2:3] glc
737 ; GFX9-NEXT: s_waitcnt vmcnt(0)
738 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
739 ; GFX9-NEXT: s_mov_b32 s6, -1
740 ; GFX9-NEXT: v_pk_sub_i16 v0, v2, v3
741 ; GFX9-NEXT: v_alignbit_b32 v2, 0, v0, 16
742 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
743 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
744 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
745 ; GFX9-NEXT: s_endpgm
747 ; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64:
749 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
750 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
751 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
752 ; VI-NEXT: s_waitcnt lgkmcnt(0)
753 ; VI-NEXT: v_mov_b32_e32 v1, s7
754 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
755 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
756 ; VI-NEXT: v_mov_b32_e32 v3, s1
757 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
758 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
759 ; VI-NEXT: flat_load_dword v4, v[0:1] glc
760 ; VI-NEXT: s_waitcnt vmcnt(0)
761 ; VI-NEXT: flat_load_dword v2, v[2:3] glc
762 ; VI-NEXT: s_waitcnt vmcnt(0)
763 ; VI-NEXT: v_mov_b32_e32 v1, 0
764 ; VI-NEXT: s_mov_b32 s7, 0xf000
765 ; VI-NEXT: s_mov_b32 s6, -1
766 ; VI-NEXT: v_mov_b32_e32 v3, v1
767 ; VI-NEXT: v_sub_u16_e32 v0, v4, v2
768 ; VI-NEXT: v_sub_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
769 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
772 ; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i64:
774 ; GFX10-NEXT: s_clause 0x1
775 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
776 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
777 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
778 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
779 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
780 ; GFX10-NEXT: s_waitcnt vmcnt(0)
781 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
782 ; GFX10-NEXT: s_waitcnt vmcnt(0)
783 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
784 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000
785 ; GFX10-NEXT: s_mov_b32 s6, -1
786 ; GFX10-NEXT: v_pk_sub_i16 v0, v1, v2
787 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
788 ; GFX10-NEXT: v_alignbit_b32 v2, 0, v0, 16
789 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
790 ; GFX10-NEXT: v_mov_b32_e32 v3, v1
791 ; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
792 ; GFX10-NEXT: s_endpgm
794 ; GFX11-LABEL: v_test_sub_v2i16_zext_to_v2i64:
796 ; GFX11-NEXT: s_clause 0x1
797 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
798 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
799 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
800 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
801 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
802 ; GFX11-NEXT: s_waitcnt vmcnt(0)
803 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
804 ; GFX11-NEXT: s_waitcnt vmcnt(0)
805 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
806 ; GFX11-NEXT: s_mov_b32 s6, -1
807 ; GFX11-NEXT: v_pk_sub_i16 v0, v1, v0
808 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
809 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
810 ; GFX11-NEXT: v_alignbit_b32 v2, 0, v0, 16
811 ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0
812 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
813 ; GFX11-NEXT: s_nop 0
814 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
815 ; GFX11-NEXT: s_endpgm
816 %tid = call i32 @llvm.amdgcn.workitem.id.x()
817 %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid
818 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
819 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
820 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
821 %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
822 %add = sub <2 x i16> %a, %b
823 %ext = zext <2 x i16> %add to <2 x i64>
824 store <2 x i64> %ext, ptr addrspace(1) %out
828 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Test: element-wise sub of two <2 x i16> vectors whose result is sign-extended
; to <2 x i32>.
;   - %in0 / %in1 are indexed by the workitem id and read with volatile loads.
;   - The extended result is stored through %out.
; Expected code in the checks below: the gfx900, gfx1010 and gfx1100 runs use a
; single v_pk_sub_i16 and then split the halves with v_ashrrev_i32 (by 16) and
; v_bfe_i32 (offset 0, width 16); the tonga run uses v_sub_u16 plus
; v_sub_u16_sdwa and sign-extends each half with v_bfe_i32.
; The check lines are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
829 define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
830 ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32:
832 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
833 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
834 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
835 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
836 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
837 ; GFX9-NEXT: s_waitcnt vmcnt(0)
838 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
839 ; GFX9-NEXT: s_waitcnt vmcnt(0)
840 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
841 ; GFX9-NEXT: s_mov_b32 s6, -1
842 ; GFX9-NEXT: v_pk_sub_i16 v0, v1, v2
843 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0
844 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
845 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
846 ; GFX9-NEXT: s_endpgm
848 ; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32:
850 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
851 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
852 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
853 ; VI-NEXT: s_waitcnt lgkmcnt(0)
854 ; VI-NEXT: v_mov_b32_e32 v1, s7
855 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
856 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
857 ; VI-NEXT: v_mov_b32_e32 v3, s1
858 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
859 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
860 ; VI-NEXT: flat_load_dword v0, v[0:1] glc
861 ; VI-NEXT: s_waitcnt vmcnt(0)
862 ; VI-NEXT: flat_load_dword v1, v[2:3] glc
863 ; VI-NEXT: s_waitcnt vmcnt(0)
864 ; VI-NEXT: s_mov_b32 s7, 0xf000
865 ; VI-NEXT: s_mov_b32 s6, -1
866 ; VI-NEXT: v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
867 ; VI-NEXT: v_sub_u16_e32 v0, v0, v1
868 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
869 ; VI-NEXT: v_bfe_i32 v1, v2, 0, 16
870 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
873 ; GFX10-LABEL: v_test_sub_v2i16_sext_to_v2i32:
875 ; GFX10-NEXT: s_clause 0x1
876 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
877 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
878 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
879 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
880 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
881 ; GFX10-NEXT: s_waitcnt vmcnt(0)
882 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
883 ; GFX10-NEXT: s_waitcnt vmcnt(0)
884 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
885 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000
886 ; GFX10-NEXT: s_mov_b32 s6, -1
887 ; GFX10-NEXT: v_pk_sub_i16 v0, v1, v2
888 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 16, v0
889 ; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16
890 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
891 ; GFX10-NEXT: s_endpgm
893 ; GFX11-LABEL: v_test_sub_v2i16_sext_to_v2i32:
895 ; GFX11-NEXT: s_clause 0x1
896 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
897 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
898 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
899 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
900 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
901 ; GFX11-NEXT: s_waitcnt vmcnt(0)
902 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
903 ; GFX11-NEXT: s_waitcnt vmcnt(0)
904 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
905 ; GFX11-NEXT: s_mov_b32 s6, -1
906 ; GFX11-NEXT: v_pk_sub_i16 v0, v1, v0
907 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
908 ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v0
909 ; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16
910 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
911 ; GFX11-NEXT: s_nop 0
912 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
913 ; GFX11-NEXT: s_endpgm
; NOTE(review): %gep.out is computed below but never used; the store writes
; through %out directly, and the expected stores above use "off" with s[4:7].
; Changing the store target would require regenerating the checks -- confirm
; the intent before touching it.
; NOTE(review): the value named %add is the result of a sub instruction.
914 %tid = call i32 @llvm.amdgcn.workitem.id.x()
915 %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid
916 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
917 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
918 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
919 %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
920 %add = sub <2 x i16> %a, %b
921 %ext = sext <2 x i16> %add to <2 x i32>
922 store <2 x i32> %ext, ptr addrspace(1) %out
926 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Test: element-wise sub of two <2 x i16> vectors whose result is sign-extended
; to <2 x i64>.
;   - %in0 / %in1 are indexed by the workitem id; the loads in this function
;     are plain (not volatile).
;   - The extended result is stored through %out.
; Expected code in the checks below: both loads are issued back to back with a
; single s_waitcnt vmcnt(0) before the subtract, and carry no glc modifier.
; The gfx900, gfx1010 and gfx1100 runs use v_pk_sub_i16, the tonga run uses
; v_sub_u16 plus v_sub_u16_sdwa. In every run each 16-bit half is sign-extended
; with v_bfe_i32 (offset 0, width 16) and the upper 32 bits of each i64 element
; come from v_ashrrev_i32 by 31; the result is written with one 128-bit store.
; The check lines are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
927 define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
928 ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64:
930 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
931 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
932 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
933 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
934 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
935 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
936 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
937 ; GFX9-NEXT: s_mov_b32 s6, -1
938 ; GFX9-NEXT: s_waitcnt vmcnt(0)
939 ; GFX9-NEXT: v_pk_sub_i16 v1, v1, v2
940 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1
941 ; GFX9-NEXT: v_bfe_i32 v0, v1, 0, 16
942 ; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 16
943 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
944 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
945 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
946 ; GFX9-NEXT: s_endpgm
948 ; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64:
950 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
951 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
952 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
953 ; VI-NEXT: s_waitcnt lgkmcnt(0)
954 ; VI-NEXT: v_mov_b32_e32 v1, s7
955 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
956 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
957 ; VI-NEXT: v_mov_b32_e32 v3, s1
958 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
959 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
960 ; VI-NEXT: flat_load_dword v0, v[0:1]
961 ; VI-NEXT: flat_load_dword v1, v[2:3]
962 ; VI-NEXT: s_mov_b32 s7, 0xf000
963 ; VI-NEXT: s_mov_b32 s6, -1
964 ; VI-NEXT: s_waitcnt vmcnt(0)
965 ; VI-NEXT: v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
966 ; VI-NEXT: v_sub_u16_e32 v0, v0, v1
967 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
968 ; VI-NEXT: v_bfe_i32 v2, v2, 0, 16
969 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
970 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
971 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
974 ; GFX10-LABEL: v_test_sub_v2i16_sext_to_v2i64:
976 ; GFX10-NEXT: s_clause 0x1
977 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
978 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
979 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
980 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
981 ; GFX10-NEXT: s_clause 0x1
982 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
983 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3]
984 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
985 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000
986 ; GFX10-NEXT: s_mov_b32 s6, -1
987 ; GFX10-NEXT: s_waitcnt vmcnt(0)
988 ; GFX10-NEXT: v_pk_sub_i16 v0, v1, v2
989 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
990 ; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16
991 ; GFX10-NEXT: v_bfe_i32 v2, v1, 0, 16
992 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0
993 ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
994 ; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
995 ; GFX10-NEXT: s_endpgm
997 ; GFX11-LABEL: v_test_sub_v2i16_sext_to_v2i64:
999 ; GFX11-NEXT: s_clause 0x1
1000 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1001 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1002 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1003 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1004 ; GFX11-NEXT: s_clause 0x1
1005 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
1006 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
1007 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
1008 ; GFX11-NEXT: s_mov_b32 s6, -1
1009 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1010 ; GFX11-NEXT: v_pk_sub_i16 v0, v1, v0
1011 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1012 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1013 ; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16
1014 ; GFX11-NEXT: v_bfe_i32 v2, v1, 0, 16
1015 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1016 ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1017 ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
1018 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
1019 ; GFX11-NEXT: s_nop 0
1020 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1021 ; GFX11-NEXT: s_endpgm
; NOTE(review): %gep.out is computed below but never used; the store writes
; through %out directly, and the expected stores above use "off" with s[4:7].
; Changing the store target would require regenerating the checks -- confirm
; the intent before touching it.
; NOTE(review): the value named %add is the result of a sub instruction.
1022 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1023 %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid
1024 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
1025 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
1026 %a = load <2 x i16>, ptr addrspace(1) %gep.in0
1027 %b = load <2 x i16>, ptr addrspace(1) %gep.in1
1028 %add = sub <2 x i16> %a, %b
1029 %ext = sext <2 x i16> %add to <2 x i64>
1030 store <2 x i64> %ext, ptr addrspace(1) %out
; Intrinsic called by the kernels above to obtain the per-lane index used in
; their getelementptr instructions.
1034 declare i32 @llvm.amdgcn.workitem.id.x() #0
; Attribute group #0 is attached to the intrinsic declaration above;
; attribute group #1 is attached to the kernel definitions in this file.
1036 attributes #0 = { nounwind readnone }
1037 attributes #1 = { nounwind }