1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX9
3 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,VI
4 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX10
6 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Divergent case: both v2i16 operands are per-lane volatile global loads;
; GFX9/GFX10 select v_pk_sub_i16, VI splits into lo sub + hi SDWA sub.
define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
; GFX9-LABEL: v_test_sub_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    v_pk_sub_i16 v0, v1, v2
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_load_dword v1, v[2:3] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    v_sub_u16_e32 v2, v0, v1
; VI-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_or_b32_e32 v0, v2, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    v_pk_sub_i16 v0, v1, v2
; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}
; Uniform case: both operands are scalar loads from constant address space;
; VI has no packed scalar op and decomposes into s_sub_i32 halves.
define amdgpu_kernel void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0, <2 x i16> addrspace(4)* %in1) #1 {
; GFX9-LABEL: s_test_sub_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
; GFX9-NEXT:    s_load_dword s5, s[8:9], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s5
; GFX9-NEXT:    v_pk_sub_i16 v0, s4, v0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: s_test_sub_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
; VI-NEXT:    s_load_dword s6, s[8:9], 0x0
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s5, s4, 16
; VI-NEXT:    s_lshr_b32 s7, s6, 16
; VI-NEXT:    s_sub_i32 s4, s4, s6
; VI-NEXT:    s_sub_i32 s5, s5, s7
; VI-NEXT:    s_and_b32 s4, s4, 0xffff
; VI-NEXT:    s_lshl_b32 s5, s5, 16
; VI-NEXT:    s_or_b32 s4, s4, s5
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: s_test_sub_v2i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_pk_sub_i16 v0, s0, s1
; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
  %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
  %b = load <2 x i16>, <2 x i16> addrspace(4)* %in1
  %add = sub <2 x i16> %a, %b
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}
; x - x folds to zero at compile time; only a store of 0 remains.
define amdgpu_kernel void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0) #1 {
; GCN-LABEL: s_test_sub_self_v2i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
;
; GFX10-LABEL: s_test_sub_self_v2i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
  %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
  %add = sub <2 x i16> %a, %a
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}
166 ; FIXME: VI should not scalarize arg access.
; Operands come directly from kernel arguments (no loads in the IR body).
define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
; GFX9-LABEL: s_test_sub_v2i16_kernarg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x30
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s3
; GFX9-NEXT:    v_pk_sub_i16 v0, s2, v0
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: s_test_sub_v2i16_kernarg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
; VI-NEXT:    s_load_dword s0, s[0:1], 0x30
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s1, s2, 16
; VI-NEXT:    s_lshr_b32 s3, s0, 16
; VI-NEXT:    s_sub_i32 s1, s1, s3
; VI-NEXT:    s_sub_i32 s0, s2, s0
; VI-NEXT:    s_lshl_b32 s1, s1, 16
; VI-NEXT:    s_and_b32 s0, s0, 0xffff
; VI-NEXT:    s_or_b32 s0, s0, s1
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: s_test_sub_v2i16_kernarg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x2c
; GFX10-NEXT:    s_load_dword s3, s[0:1], 0x30
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_pk_sub_i16 v0, s2, s3
; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
  %add = sub <2 x i16> %a, %b
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}
; Sub of the constant <123, 456>; VI folds it as adds of the negated
; halves (0xff85 = -123, 0xfe38 = -456 as i16).
define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_constant:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s4, 0x1c8007b
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s4
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_constant:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, 0xfffffe38
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_add_u16_e32 v2, 0xff85, v0
; VI-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v2, v0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_constant:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 0x1c8007b
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 123, i16 456>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}
275 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Sub of the negative constant <-845, -991>; VI folds it as adds of the
; negated halves (0x34d = 845, 0x3df = 991).
define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_neg_constant:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s4, 0xfc21fcb3
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s4
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_neg_constant:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, 0x3df
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_add_u16_e32 v2, 0x34d, v0
; VI-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v2, v0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_neg_constant:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 0xfc21fcb3
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 -845, i16 -991>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}
; Sub of the inline immediate -1 in both lanes; GFX9/GFX10 fold it via
; op_sel_hi, VI folds the sub as adds of +1.
define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_neg1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0]
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_neg1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, 1
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_add_u16_e32 v2, 1, v0
; VI-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v2, v0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_inline_neg1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0]
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 -1, i16 -1>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}
; Constant with zero high half: <32, 0>. VI only subtracts in the low
; lane and merges the untouched high bits back with an AND/OR.
define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, 32
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
; VI-NEXT:    v_subrev_u16_e32 v0, 32, v0
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 32
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 32, i16 0>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}
; The constant's high element (0x3f80) is representable as the fp inline immediate 1.0.
; Constant <0, 16256>: zero low half, and the high half 0x3f80 matches an
; fp inline immediate (GFX9 materializes it as 1.0, GFX10 uses op_sel).
define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s4, 1.0
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s4
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_fp_split:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, 0xffffc080
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_inline_fp_split:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 0x3f80 op_sel:[0,1] op_sel_hi:[1,0]
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 0, i16 16256>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}
507 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Packed sub whose result is zero-extended to v2i32 before the store.
define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    v_pk_sub_i16 v0, v1, v2
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v1, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_load_dword v2, v[2:3] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    v_sub_u16_e32 v0, v1, v2
; VI-NEXT:    v_sub_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    v_pk_sub_i16 v0, v1, v2
; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  %ext = zext <2 x i16> %add to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
  ret void
}
581 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Packed sub whose result is zero-extended to v2i64; high dwords are zero.
define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dword v3, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    v_pk_sub_i16 v2, v2, v3
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v2
; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NEXT:    v_mov_b32_e32 v3, v1
; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v4, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_load_dword v2, v[2:3] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    v_mov_b32_e32 v3, v1
; VI-NEXT:    v_sub_u16_e32 v0, v4, v2
; VI-NEXT:    v_sub_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    v_pk_sub_i16 v2, v1, v2
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v2
; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; GFX10-NEXT:    v_mov_b32_e32 v3, v1
; GFX10-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  %ext = zext <2 x i16> %add to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
}
661 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Packed sub whose result is sign-extended to v2i32 (ashr / bfe_i32).
define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    v_pk_sub_i16 v0, v1, v2
; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_load_dword v1, v[2:3] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_sub_u16_e32 v0, v0, v1
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; VI-NEXT:    v_bfe_i32 v1, v2, 0, 16
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_sext_to_v2i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    v_pk_sub_i16 v0, v1, v2
; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
; GFX10-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  %ext = sext <2 x i16> %add to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
  ret void
}
737 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Packed sub whose result is sign-extended to v2i64; loads here are NOT
; volatile, so they may be clustered (single waitcnt, GFX10 s_clause).
define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v1, v1, v2
; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
; GFX9-NEXT:    v_bfe_i32 v0, v1, 0, 16
; GFX9-NEXT:    v_bfe_i32 v2, v2, 0, 16
; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    flat_load_dword v1, v[2:3]
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_sub_u16_e32 v0, v0, v1
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; VI-NEXT:    v_bfe_i32 v2, v2, 0, 16
; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_sext_to_v2i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX10-NEXT:    global_load_dword v2, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_pk_sub_i16 v0, v1, v2
; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX10-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX10-NEXT:    v_bfe_i32 v2, v1, 0, 16
; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; GFX10-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  %ext = sext <2 x i16> %add to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
}
819 declare i32 @llvm.amdgcn.workitem.id.x() #0
821 attributes #0 = { nounwind readnone }
822 attributes #1 = { nounwind }