1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI
3 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI
4 ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GFX9
5 ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefix=GFX10
6 ; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s --check-prefix=GFX11
8 ; Test that add/sub with a constant is swapped to sub/add with negated
9 ; constant to minimize code size.
; Vector case: each work-item loads in[tid], subtracts 64 and stores the
; result to out[tid].
; On every target the expected output keeps a subtract whose immediate
; operand is 64 (v_subrev_* with 64) instead of an add of a negated literal.
; Assumption to confirm against the AMDGPU operand documentation: 64 can be
; encoded as an inline constant while -64 cannot, which is why the subtract
; form is the smaller encoding.
11 define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
12 ; SI-LABEL: v_test_i32_x_sub_64:
14 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
15 ; SI-NEXT: s_mov_b32 s7, 0xf000
16 ; SI-NEXT: s_mov_b32 s6, 0
17 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
18 ; SI-NEXT: v_mov_b32_e32 v1, 0
19 ; SI-NEXT: s_waitcnt lgkmcnt(0)
20 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
21 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
22 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
23 ; SI-NEXT: s_waitcnt vmcnt(0)
24 ; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
25 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
28 ; VI-LABEL: v_test_i32_x_sub_64:
30 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
31 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
32 ; VI-NEXT: s_waitcnt lgkmcnt(0)
33 ; VI-NEXT: v_mov_b32_e32 v1, s3
34 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
35 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
36 ; VI-NEXT: flat_load_dword v3, v[0:1]
37 ; VI-NEXT: v_mov_b32_e32 v1, s1
38 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
39 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
40 ; VI-NEXT: s_waitcnt vmcnt(0)
41 ; VI-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3
42 ; VI-NEXT: flat_store_dword v[0:1], v2
45 ; GFX9-LABEL: v_test_i32_x_sub_64:
47 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
48 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
49 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
50 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
51 ; GFX9-NEXT: s_waitcnt vmcnt(0)
52 ; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v1
53 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
56 ; GFX10-LABEL: v_test_i32_x_sub_64:
58 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
59 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
60 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
61 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
62 ; GFX10-NEXT: s_waitcnt vmcnt(0)
63 ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 64, v1
64 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
65 ; GFX10-NEXT: s_endpgm
67 ; GFX11-LABEL: v_test_i32_x_sub_64:
69 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
70 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
71 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
72 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
73 ; GFX11-NEXT: s_waitcnt vmcnt(0)
74 ; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 64, v1
75 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
77 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
78 ; GFX11-NEXT: s_endpgm
79 %tid = call i32 @llvm.amdgcn.workitem.id.x()
80 %tid.ext = sext i32 %tid to i64
81 %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
82 %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
83 %x = load i32, ptr addrspace(1) %gep
84 %result = sub i32 %x, 64
85 store i32 %result, ptr addrspace(1) %gep.out
; Same "x - 64" pattern as the single-use vector test, but applied to two
; separately loaded values, so the constant 64 has two users.
; Both loads and both stores are volatile; the expected output contains two
; distinct loads, two subtracts with immediate 64, and two stores on every
; target (presumably volatile is what keeps them from being merged - confirm).
89 define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
90 ; SI-LABEL: v_test_i32_x_sub_64_multi_use:
92 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
93 ; SI-NEXT: s_mov_b32 s7, 0xf000
94 ; SI-NEXT: s_mov_b32 s6, 0
95 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
96 ; SI-NEXT: v_mov_b32_e32 v1, 0
97 ; SI-NEXT: s_waitcnt lgkmcnt(0)
98 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
99 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
100 ; SI-NEXT: s_waitcnt vmcnt(0)
101 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
102 ; SI-NEXT: s_waitcnt vmcnt(0)
103 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
104 ; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
105 ; SI-NEXT: v_subrev_i32_e32 v3, vcc, 64, v3
106 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
107 ; SI-NEXT: s_waitcnt vmcnt(0)
108 ; SI-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
109 ; SI-NEXT: s_waitcnt vmcnt(0)
112 ; VI-LABEL: v_test_i32_x_sub_64_multi_use:
114 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
115 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
116 ; VI-NEXT: s_waitcnt lgkmcnt(0)
117 ; VI-NEXT: v_mov_b32_e32 v1, s3
118 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
119 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
120 ; VI-NEXT: flat_load_dword v3, v[0:1] glc
121 ; VI-NEXT: s_waitcnt vmcnt(0)
122 ; VI-NEXT: flat_load_dword v4, v[0:1] glc
123 ; VI-NEXT: s_waitcnt vmcnt(0)
124 ; VI-NEXT: v_mov_b32_e32 v1, s1
125 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
126 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
127 ; VI-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3
128 ; VI-NEXT: v_subrev_u32_e32 v3, vcc, 64, v4
129 ; VI-NEXT: flat_store_dword v[0:1], v2
130 ; VI-NEXT: s_waitcnt vmcnt(0)
131 ; VI-NEXT: flat_store_dword v[0:1], v3
132 ; VI-NEXT: s_waitcnt vmcnt(0)
135 ; GFX9-LABEL: v_test_i32_x_sub_64_multi_use:
137 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
138 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
139 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
140 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
141 ; GFX9-NEXT: s_waitcnt vmcnt(0)
142 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
143 ; GFX9-NEXT: s_waitcnt vmcnt(0)
144 ; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v1
145 ; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v2
146 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
147 ; GFX9-NEXT: s_waitcnt vmcnt(0)
148 ; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
149 ; GFX9-NEXT: s_waitcnt vmcnt(0)
150 ; GFX9-NEXT: s_endpgm
152 ; GFX10-LABEL: v_test_i32_x_sub_64_multi_use:
154 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
155 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
156 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
157 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
158 ; GFX10-NEXT: s_waitcnt vmcnt(0)
159 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
160 ; GFX10-NEXT: s_waitcnt vmcnt(0)
161 ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 64, v1
162 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v2
163 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
164 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
165 ; GFX10-NEXT: global_store_dword v0, v2, s[0:1]
166 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
167 ; GFX10-NEXT: s_endpgm
169 ; GFX11-LABEL: v_test_i32_x_sub_64_multi_use:
171 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
172 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
173 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
174 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
175 ; GFX11-NEXT: s_waitcnt vmcnt(0)
176 ; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
177 ; GFX11-NEXT: s_waitcnt vmcnt(0)
178 ; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 64, v1
179 ; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 64, v2
180 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc
181 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
182 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] dlc
183 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
184 ; GFX11-NEXT: s_nop 0
185 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
186 ; GFX11-NEXT: s_endpgm
187 %tid = call i32 @llvm.amdgcn.workitem.id.x()
188 %tid.ext = sext i32 %tid to i64
189 %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
190 %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
191 %x = load volatile i32, ptr addrspace(1) %gep
192 %y = load volatile i32, ptr addrspace(1) %gep
193 %result0 = sub i32 %x, 64
194 %result1 = sub i32 %y, 64
195 store volatile i32 %result0, ptr addrspace(1) %gep.out
196 store volatile i32 %result1, ptr addrspace(1) %gep.out
; Constant on the left-hand side: each work-item stores 64 - in[tid] to
; out[tid].
; The expected output is a plain v_sub_* with 64 as the first source operand
; on every target; no add/sub swap is expected for this operand order.
200 define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
201 ; SI-LABEL: v_test_i32_64_sub_x:
203 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
204 ; SI-NEXT: s_mov_b32 s7, 0xf000
205 ; SI-NEXT: s_mov_b32 s6, 0
206 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
207 ; SI-NEXT: v_mov_b32_e32 v1, 0
208 ; SI-NEXT: s_waitcnt lgkmcnt(0)
209 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
210 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
211 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
212 ; SI-NEXT: s_waitcnt vmcnt(0)
213 ; SI-NEXT: v_sub_i32_e32 v2, vcc, 64, v2
214 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
217 ; VI-LABEL: v_test_i32_64_sub_x:
219 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
220 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
221 ; VI-NEXT: s_waitcnt lgkmcnt(0)
222 ; VI-NEXT: v_mov_b32_e32 v1, s3
223 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
224 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
225 ; VI-NEXT: flat_load_dword v3, v[0:1]
226 ; VI-NEXT: v_mov_b32_e32 v1, s1
227 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
228 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
229 ; VI-NEXT: s_waitcnt vmcnt(0)
230 ; VI-NEXT: v_sub_u32_e32 v2, vcc, 64, v3
231 ; VI-NEXT: flat_store_dword v[0:1], v2
234 ; GFX9-LABEL: v_test_i32_64_sub_x:
236 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
237 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
238 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
239 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
240 ; GFX9-NEXT: s_waitcnt vmcnt(0)
241 ; GFX9-NEXT: v_sub_u32_e32 v1, 64, v1
242 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
243 ; GFX9-NEXT: s_endpgm
245 ; GFX10-LABEL: v_test_i32_64_sub_x:
247 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
248 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
249 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
250 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
251 ; GFX10-NEXT: s_waitcnt vmcnt(0)
252 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 64, v1
253 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
254 ; GFX10-NEXT: s_endpgm
256 ; GFX11-LABEL: v_test_i32_64_sub_x:
258 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
259 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
260 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
261 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
262 ; GFX11-NEXT: s_waitcnt vmcnt(0)
263 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 64, v1
264 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
265 ; GFX11-NEXT: s_nop 0
266 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
267 ; GFX11-NEXT: s_endpgm
268 %tid = call i32 @llvm.amdgcn.workitem.id.x()
269 %tid.ext = sext i32 %tid to i64
270 %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
271 %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
272 %x = load i32, ptr addrspace(1) %gep
273 %result = sub i32 64, %x
274 store i32 %result, ptr addrspace(1) %gep.out
; Boundary case one past 64: each work-item stores in[tid] - 65 to out[tid].
; The expected output is an add of the 32-bit literal 0xffffffbf (-65) on
; every target, i.e. the subtract form is NOT chosen here.
; Assumption to confirm against the AMDGPU operand documentation: neither 65
; nor -65 is an inline constant, so a literal is needed whichever form is
; used and the swap would save nothing.
278 define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
279 ; SI-LABEL: v_test_i32_x_sub_65:
281 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
282 ; SI-NEXT: s_mov_b32 s7, 0xf000
283 ; SI-NEXT: s_mov_b32 s6, 0
284 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
285 ; SI-NEXT: v_mov_b32_e32 v1, 0
286 ; SI-NEXT: s_waitcnt lgkmcnt(0)
287 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
288 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
289 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
290 ; SI-NEXT: s_waitcnt vmcnt(0)
291 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffffffbf, v2
292 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
295 ; VI-LABEL: v_test_i32_x_sub_65:
297 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
298 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
299 ; VI-NEXT: s_waitcnt lgkmcnt(0)
300 ; VI-NEXT: v_mov_b32_e32 v1, s3
301 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
302 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
303 ; VI-NEXT: flat_load_dword v3, v[0:1]
304 ; VI-NEXT: v_mov_b32_e32 v1, s1
305 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
306 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
307 ; VI-NEXT: s_waitcnt vmcnt(0)
308 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0xffffffbf, v3
309 ; VI-NEXT: flat_store_dword v[0:1], v2
312 ; GFX9-LABEL: v_test_i32_x_sub_65:
314 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
315 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
316 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
317 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
318 ; GFX9-NEXT: s_waitcnt vmcnt(0)
319 ; GFX9-NEXT: v_add_u32_e32 v1, 0xffffffbf, v1
320 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
321 ; GFX9-NEXT: s_endpgm
323 ; GFX10-LABEL: v_test_i32_x_sub_65:
325 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
326 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
327 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
328 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
329 ; GFX10-NEXT: s_waitcnt vmcnt(0)
330 ; GFX10-NEXT: v_add_nc_u32_e32 v1, 0xffffffbf, v1
331 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
332 ; GFX10-NEXT: s_endpgm
334 ; GFX11-LABEL: v_test_i32_x_sub_65:
336 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
337 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
338 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
339 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
340 ; GFX11-NEXT: s_waitcnt vmcnt(0)
341 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xffffffbf, v1
342 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
343 ; GFX11-NEXT: s_nop 0
344 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
345 ; GFX11-NEXT: s_endpgm
346 %tid = call i32 @llvm.amdgcn.workitem.id.x()
347 %tid.ext = sext i32 %tid to i64
348 %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
349 %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
350 %x = load i32, ptr addrspace(1) %gep
351 %result = sub i32 %x, 65
352 store i32 %result, ptr addrspace(1) %gep.out
; Constant on the left-hand side, one past 64: each work-item stores
; 65 - in[tid] to out[tid].
; The expected output is a v_sub_* with the literal 0x41 (65) as the first
; source operand on every target.
356 define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
357 ; SI-LABEL: v_test_i32_65_sub_x:
359 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
360 ; SI-NEXT: s_mov_b32 s7, 0xf000
361 ; SI-NEXT: s_mov_b32 s6, 0
362 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
363 ; SI-NEXT: v_mov_b32_e32 v1, 0
364 ; SI-NEXT: s_waitcnt lgkmcnt(0)
365 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
366 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
367 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
368 ; SI-NEXT: s_waitcnt vmcnt(0)
369 ; SI-NEXT: v_sub_i32_e32 v2, vcc, 0x41, v2
370 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
373 ; VI-LABEL: v_test_i32_65_sub_x:
375 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
376 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
377 ; VI-NEXT: s_waitcnt lgkmcnt(0)
378 ; VI-NEXT: v_mov_b32_e32 v1, s3
379 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
380 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
381 ; VI-NEXT: flat_load_dword v3, v[0:1]
382 ; VI-NEXT: v_mov_b32_e32 v1, s1
383 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
384 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
385 ; VI-NEXT: s_waitcnt vmcnt(0)
386 ; VI-NEXT: v_sub_u32_e32 v2, vcc, 0x41, v3
387 ; VI-NEXT: flat_store_dword v[0:1], v2
390 ; GFX9-LABEL: v_test_i32_65_sub_x:
392 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
393 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
394 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
395 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
396 ; GFX9-NEXT: s_waitcnt vmcnt(0)
397 ; GFX9-NEXT: v_sub_u32_e32 v1, 0x41, v1
398 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
399 ; GFX9-NEXT: s_endpgm
401 ; GFX10-LABEL: v_test_i32_65_sub_x:
403 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
404 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
405 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
406 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
407 ; GFX10-NEXT: s_waitcnt vmcnt(0)
408 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0x41, v1
409 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
410 ; GFX10-NEXT: s_endpgm
412 ; GFX11-LABEL: v_test_i32_65_sub_x:
414 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
415 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
416 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
417 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
418 ; GFX11-NEXT: s_waitcnt vmcnt(0)
419 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0x41, v1
420 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
421 ; GFX11-NEXT: s_nop 0
422 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
423 ; GFX11-NEXT: s_endpgm
424 %tid = call i32 @llvm.amdgcn.workitem.id.x()
425 %tid.ext = sext i32 %tid to i64
426 %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
427 %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
428 %x = load i32, ptr addrspace(1) %gep
429 %result = sub i32 65, %x
430 store i32 %result, ptr addrspace(1) %gep.out
; Negative constant on the right-hand side: each work-item stores
; in[tid] - (-16) to out[tid].
; The expected output is an add with immediate 16 on every target, so no
; negative operand appears in the instruction.
434 define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
435 ; SI-LABEL: v_test_i32_x_sub_neg16:
437 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
438 ; SI-NEXT: s_mov_b32 s7, 0xf000
439 ; SI-NEXT: s_mov_b32 s6, 0
440 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
441 ; SI-NEXT: v_mov_b32_e32 v1, 0
442 ; SI-NEXT: s_waitcnt lgkmcnt(0)
443 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
444 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
445 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
446 ; SI-NEXT: s_waitcnt vmcnt(0)
447 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v2
448 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
451 ; VI-LABEL: v_test_i32_x_sub_neg16:
453 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
454 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
455 ; VI-NEXT: s_waitcnt lgkmcnt(0)
456 ; VI-NEXT: v_mov_b32_e32 v1, s3
457 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
458 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
459 ; VI-NEXT: flat_load_dword v3, v[0:1]
460 ; VI-NEXT: v_mov_b32_e32 v1, s1
461 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
462 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
463 ; VI-NEXT: s_waitcnt vmcnt(0)
464 ; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v3
465 ; VI-NEXT: flat_store_dword v[0:1], v2
468 ; GFX9-LABEL: v_test_i32_x_sub_neg16:
470 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
471 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
472 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
473 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
474 ; GFX9-NEXT: s_waitcnt vmcnt(0)
475 ; GFX9-NEXT: v_add_u32_e32 v1, 16, v1
476 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
477 ; GFX9-NEXT: s_endpgm
479 ; GFX10-LABEL: v_test_i32_x_sub_neg16:
481 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
482 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
483 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
484 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
485 ; GFX10-NEXT: s_waitcnt vmcnt(0)
486 ; GFX10-NEXT: v_add_nc_u32_e32 v1, 16, v1
487 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
488 ; GFX10-NEXT: s_endpgm
490 ; GFX11-LABEL: v_test_i32_x_sub_neg16:
492 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
493 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
494 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
495 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
496 ; GFX11-NEXT: s_waitcnt vmcnt(0)
497 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 16, v1
498 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
499 ; GFX11-NEXT: s_nop 0
500 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
501 ; GFX11-NEXT: s_endpgm
502 %tid = call i32 @llvm.amdgcn.workitem.id.x()
503 %tid.ext = sext i32 %tid to i64
504 %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
505 %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
506 %x = load i32, ptr addrspace(1) %gep
507 %result = sub i32 %x, -16
508 store i32 %result, ptr addrspace(1) %gep.out
; Negative constant on the left-hand side: each work-item stores
; -16 - in[tid] to out[tid].
; The expected output is a v_sub_* with -16 written directly as the first
; source operand (no 32-bit hex literal) on every target.
512 define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
513 ; SI-LABEL: v_test_i32_neg16_sub_x:
515 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
516 ; SI-NEXT: s_mov_b32 s7, 0xf000
517 ; SI-NEXT: s_mov_b32 s6, 0
518 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
519 ; SI-NEXT: v_mov_b32_e32 v1, 0
520 ; SI-NEXT: s_waitcnt lgkmcnt(0)
521 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
522 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
523 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
524 ; SI-NEXT: s_waitcnt vmcnt(0)
525 ; SI-NEXT: v_sub_i32_e32 v2, vcc, -16, v2
526 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
529 ; VI-LABEL: v_test_i32_neg16_sub_x:
531 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
532 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
533 ; VI-NEXT: s_waitcnt lgkmcnt(0)
534 ; VI-NEXT: v_mov_b32_e32 v1, s3
535 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
536 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
537 ; VI-NEXT: flat_load_dword v3, v[0:1]
538 ; VI-NEXT: v_mov_b32_e32 v1, s1
539 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
540 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
541 ; VI-NEXT: s_waitcnt vmcnt(0)
542 ; VI-NEXT: v_sub_u32_e32 v2, vcc, -16, v3
543 ; VI-NEXT: flat_store_dword v[0:1], v2
546 ; GFX9-LABEL: v_test_i32_neg16_sub_x:
548 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
549 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
550 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
551 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
552 ; GFX9-NEXT: s_waitcnt vmcnt(0)
553 ; GFX9-NEXT: v_sub_u32_e32 v1, -16, v1
554 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
555 ; GFX9-NEXT: s_endpgm
557 ; GFX10-LABEL: v_test_i32_neg16_sub_x:
559 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
560 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
561 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
562 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
563 ; GFX10-NEXT: s_waitcnt vmcnt(0)
564 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, -16, v1
565 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
566 ; GFX10-NEXT: s_endpgm
568 ; GFX11-LABEL: v_test_i32_neg16_sub_x:
570 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
571 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
572 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
573 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
574 ; GFX11-NEXT: s_waitcnt vmcnt(0)
575 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, -16, v1
576 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
577 ; GFX11-NEXT: s_nop 0
578 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
579 ; GFX11-NEXT: s_endpgm
580 %tid = call i32 @llvm.amdgcn.workitem.id.x()
581 %tid.ext = sext i32 %tid to i64
582 %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
583 %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
584 %x = load i32, ptr addrspace(1) %gep
585 %result = sub i32 -16, %x
586 store i32 %result, ptr addrspace(1) %gep.out
; Negative constant one past -16 on the right-hand side: each work-item
; stores in[tid] - (-17) to out[tid].
; The expected output is an add with immediate 17 on every target.
590 define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
591 ; SI-LABEL: v_test_i32_x_sub_neg17:
593 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
594 ; SI-NEXT: s_mov_b32 s7, 0xf000
595 ; SI-NEXT: s_mov_b32 s6, 0
596 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
597 ; SI-NEXT: v_mov_b32_e32 v1, 0
598 ; SI-NEXT: s_waitcnt lgkmcnt(0)
599 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
600 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
601 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
602 ; SI-NEXT: s_waitcnt vmcnt(0)
603 ; SI-NEXT: v_add_i32_e32 v2, vcc, 17, v2
604 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
607 ; VI-LABEL: v_test_i32_x_sub_neg17:
609 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
610 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
611 ; VI-NEXT: s_waitcnt lgkmcnt(0)
612 ; VI-NEXT: v_mov_b32_e32 v1, s3
613 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
614 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
615 ; VI-NEXT: flat_load_dword v3, v[0:1]
616 ; VI-NEXT: v_mov_b32_e32 v1, s1
617 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
618 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
619 ; VI-NEXT: s_waitcnt vmcnt(0)
620 ; VI-NEXT: v_add_u32_e32 v2, vcc, 17, v3
621 ; VI-NEXT: flat_store_dword v[0:1], v2
624 ; GFX9-LABEL: v_test_i32_x_sub_neg17:
626 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
627 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
628 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
629 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
630 ; GFX9-NEXT: s_waitcnt vmcnt(0)
631 ; GFX9-NEXT: v_add_u32_e32 v1, 17, v1
632 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
633 ; GFX9-NEXT: s_endpgm
635 ; GFX10-LABEL: v_test_i32_x_sub_neg17:
637 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
638 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
639 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
640 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
641 ; GFX10-NEXT: s_waitcnt vmcnt(0)
642 ; GFX10-NEXT: v_add_nc_u32_e32 v1, 17, v1
643 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
644 ; GFX10-NEXT: s_endpgm
646 ; GFX11-LABEL: v_test_i32_x_sub_neg17:
648 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
649 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
650 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
651 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
652 ; GFX11-NEXT: s_waitcnt vmcnt(0)
653 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 17, v1
654 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
655 ; GFX11-NEXT: s_nop 0
656 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
657 ; GFX11-NEXT: s_endpgm
658 %tid = call i32 @llvm.amdgcn.workitem.id.x()
659 %tid.ext = sext i32 %tid to i64
660 %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
661 %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
662 %x = load i32, ptr addrspace(1) %gep
663 %result = sub i32 %x, -17
664 store i32 %result, ptr addrspace(1) %gep.out
; Negative constant one past -16 on the left-hand side: each work-item
; stores -17 - in[tid] to out[tid].
; The expected output is a v_sub_* whose first source operand is the 32-bit
; literal 0xffffffef (-17) on every target, in contrast to the -16 variant
; where the constant is printed directly as -16.
668 define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
669 ; SI-LABEL: v_test_i32_neg17_sub_x:
671 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
672 ; SI-NEXT: s_mov_b32 s7, 0xf000
673 ; SI-NEXT: s_mov_b32 s6, 0
674 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
675 ; SI-NEXT: v_mov_b32_e32 v1, 0
676 ; SI-NEXT: s_waitcnt lgkmcnt(0)
677 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
678 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
679 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
680 ; SI-NEXT: s_waitcnt vmcnt(0)
681 ; SI-NEXT: v_sub_i32_e32 v2, vcc, 0xffffffef, v2
682 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
685 ; VI-LABEL: v_test_i32_neg17_sub_x:
687 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
688 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
689 ; VI-NEXT: s_waitcnt lgkmcnt(0)
690 ; VI-NEXT: v_mov_b32_e32 v1, s3
691 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
692 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
693 ; VI-NEXT: flat_load_dword v3, v[0:1]
694 ; VI-NEXT: v_mov_b32_e32 v1, s1
695 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
696 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
697 ; VI-NEXT: s_waitcnt vmcnt(0)
698 ; VI-NEXT: v_sub_u32_e32 v2, vcc, 0xffffffef, v3
699 ; VI-NEXT: flat_store_dword v[0:1], v2
702 ; GFX9-LABEL: v_test_i32_neg17_sub_x:
704 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
705 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
706 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
707 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
708 ; GFX9-NEXT: s_waitcnt vmcnt(0)
709 ; GFX9-NEXT: v_sub_u32_e32 v1, 0xffffffef, v1
710 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
711 ; GFX9-NEXT: s_endpgm
713 ; GFX10-LABEL: v_test_i32_neg17_sub_x:
715 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
716 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
717 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
718 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
719 ; GFX10-NEXT: s_waitcnt vmcnt(0)
720 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0xffffffef, v1
721 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
722 ; GFX10-NEXT: s_endpgm
724 ; GFX11-LABEL: v_test_i32_neg17_sub_x:
726 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
727 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
728 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
729 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
730 ; GFX11-NEXT: s_waitcnt vmcnt(0)
731 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0xffffffef, v1
732 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
733 ; GFX11-NEXT: s_nop 0
734 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
735 ; GFX11-NEXT: s_endpgm
736 %tid = call i32 @llvm.amdgcn.workitem.id.x()
737 %tid.ext = sext i32 %tid to i64
738 %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
739 %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
740 %x = load i32, ptr addrspace(1) %gep
741 %result = sub i32 -17, %x
742 store i32 %result, ptr addrspace(1) %gep.out
; Scalar case: %x is a kernel argument, and the result of x - 64 is consumed
; by an inline asm statement through an "s" register constraint.
; The expected output loads the argument with a scalar load and computes the
; result with s_sub_i32 using immediate 64 on every target.
746 define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 {
747 ; SI-LABEL: s_test_i32_x_sub_64:
749 ; SI-NEXT: s_load_dword s0, s[0:1], 0x9
750 ; SI-NEXT: s_waitcnt lgkmcnt(0)
751 ; SI-NEXT: s_sub_i32 s0, s0, 64
752 ; SI-NEXT: ;;#ASMSTART
757 ; VI-LABEL: s_test_i32_x_sub_64:
759 ; VI-NEXT: s_load_dword s0, s[0:1], 0x24
760 ; VI-NEXT: s_waitcnt lgkmcnt(0)
761 ; VI-NEXT: s_sub_i32 s0, s0, 64
762 ; VI-NEXT: ;;#ASMSTART
767 ; GFX9-LABEL: s_test_i32_x_sub_64:
769 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
770 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
771 ; GFX9-NEXT: s_sub_i32 s0, s0, 64
772 ; GFX9-NEXT: ;;#ASMSTART
773 ; GFX9-NEXT: ; use s0
774 ; GFX9-NEXT: ;;#ASMEND
775 ; GFX9-NEXT: s_endpgm
777 ; GFX10-LABEL: s_test_i32_x_sub_64:
779 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
780 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
781 ; GFX10-NEXT: s_sub_i32 s0, s0, 64
782 ; GFX10-NEXT: ;;#ASMSTART
783 ; GFX10-NEXT: ; use s0
784 ; GFX10-NEXT: ;;#ASMEND
785 ; GFX10-NEXT: s_endpgm
787 ; GFX11-LABEL: s_test_i32_x_sub_64:
789 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24
790 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
791 ; GFX11-NEXT: s_sub_i32 s0, s0, 64
792 ; GFX11-NEXT: ;;#ASMSTART
793 ; GFX11-NEXT: ; use s0
794 ; GFX11-NEXT: ;;#ASMEND
795 ; GFX11-NEXT: s_endpgm
796 %result = sub i32 %x, 64
797 call void asm sideeffect "; use $0", "s"(i32 %result)
; 16-bit vector case: each work-item loads an i16 from in[tid], subtracts 64
; and stores the i16 result to out[tid].
; Expected subtract per target: the SI output uses the 32-bit
; v_subrev_i32_e32 followed by a short store; VI and GFX9 use
; v_subrev_u16_e32; GFX10 and GFX11 use v_sub_nc_u16 with 64 as the last
; operand. In every case the immediate that appears is 64.
801 define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
802 ; SI-LABEL: v_test_i16_x_sub_64:
804 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
805 ; SI-NEXT: s_mov_b32 s7, 0xf000
806 ; SI-NEXT: s_mov_b32 s6, 0
807 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
808 ; SI-NEXT: v_mov_b32_e32 v1, 0
809 ; SI-NEXT: s_waitcnt lgkmcnt(0)
810 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
811 ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
812 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
813 ; SI-NEXT: s_waitcnt vmcnt(0)
814 ; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
815 ; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
818 ; VI-LABEL: v_test_i16_x_sub_64:
820 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
821 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
822 ; VI-NEXT: s_waitcnt lgkmcnt(0)
823 ; VI-NEXT: v_mov_b32_e32 v1, s3
824 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
825 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
826 ; VI-NEXT: flat_load_ushort v3, v[0:1]
827 ; VI-NEXT: v_mov_b32_e32 v1, s1
828 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
829 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
830 ; VI-NEXT: s_waitcnt vmcnt(0)
831 ; VI-NEXT: v_subrev_u16_e32 v2, 64, v3
832 ; VI-NEXT: flat_store_short v[0:1], v2
835 ; GFX9-LABEL: v_test_i16_x_sub_64:
837 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
838 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
839 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
840 ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
841 ; GFX9-NEXT: s_waitcnt vmcnt(0)
842 ; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1
843 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
844 ; GFX9-NEXT: s_endpgm
846 ; GFX10-LABEL: v_test_i16_x_sub_64:
848 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
849 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
850 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
851 ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
852 ; GFX10-NEXT: s_waitcnt vmcnt(0)
853 ; GFX10-NEXT: v_sub_nc_u16 v1, v1, 64
854 ; GFX10-NEXT: global_store_short v0, v1, s[0:1]
855 ; GFX10-NEXT: s_endpgm
857 ; GFX11-LABEL: v_test_i16_x_sub_64:
859 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
860 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
861 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
862 ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
863 ; GFX11-NEXT: s_waitcnt vmcnt(0)
864 ; GFX11-NEXT: v_sub_nc_u16 v1, v1, 64
865 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
866 ; GFX11-NEXT: s_nop 0
867 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
868 ; GFX11-NEXT: s_endpgm
869 %tid = call i32 @llvm.amdgcn.workitem.id.x()
870 %tid.ext = sext i32 %tid to i64
871 %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 %tid.ext
872 %gep.out = getelementptr inbounds i16, ptr addrspace(1) %out, i64 %tid.ext
873 %x = load i16, ptr addrspace(1) %gep
874 %result = sub i16 %x, 64
875 store i16 %result, ptr addrspace(1) %gep.out
; Each work-item loads one i16 from %in, subtracts the constant 64, then
; zero-extends the 16-bit difference to i32 and stores it to the i32 element
; of %out at the same work-item index.
879 define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
880 ; SI-LABEL: v_test_i16_x_sub_64_zext_to_i32:
882 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
883 ; SI-NEXT: s_mov_b32 s7, 0xf000
884 ; SI-NEXT: s_mov_b32 s6, 0
885 ; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
886 ; SI-NEXT: v_mov_b32_e32 v2, 0
887 ; SI-NEXT: s_waitcnt lgkmcnt(0)
888 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
889 ; SI-NEXT: buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64
890 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
891 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
892 ; SI-NEXT: s_waitcnt vmcnt(0)
893 ; SI-NEXT: v_subrev_i32_e32 v0, vcc, 64, v3
894 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
895 ; SI-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
898 ; VI-LABEL: v_test_i16_x_sub_64_zext_to_i32:
900 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
901 ; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
902 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
903 ; VI-NEXT: s_waitcnt lgkmcnt(0)
904 ; VI-NEXT: v_mov_b32_e32 v2, s3
905 ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
906 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
907 ; VI-NEXT: flat_load_ushort v2, v[1:2]
908 ; VI-NEXT: v_mov_b32_e32 v1, s1
909 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
910 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
911 ; VI-NEXT: s_waitcnt vmcnt(0)
912 ; VI-NEXT: v_subrev_u16_e32 v2, 64, v2
913 ; VI-NEXT: flat_store_dword v[0:1], v2
916 ; GFX9-LABEL: v_test_i16_x_sub_64_zext_to_i32:
918 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
919 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0
920 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
921 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
922 ; GFX9-NEXT: global_load_ushort v1, v1, s[2:3]
923 ; GFX9-NEXT: s_waitcnt vmcnt(0)
924 ; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1
925 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
926 ; GFX9-NEXT: s_endpgm
928 ; GFX10-LABEL: v_test_i16_x_sub_64_zext_to_i32:
930 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
931 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v0
932 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
933 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
934 ; GFX10-NEXT: global_load_ushort v1, v1, s[2:3]
935 ; GFX10-NEXT: s_waitcnt vmcnt(0)
936 ; GFX10-NEXT: v_sub_nc_u16 v1, v1, 64
937 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
938 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
939 ; GFX10-NEXT: s_endpgm
941 ; GFX11-LABEL: v_test_i16_x_sub_64_zext_to_i32:
943 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
944 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0
945 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
946 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
947 ; GFX11-NEXT: global_load_u16 v1, v1, s[2:3]
948 ; GFX11-NEXT: s_waitcnt vmcnt(0)
949 ; GFX11-NEXT: v_sub_nc_u16 v1, v1, 64
950 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
951 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
952 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
953 ; GFX11-NEXT: s_nop 0
954 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
955 ; GFX11-NEXT: s_endpgm
956 %tid = call i32 @llvm.amdgcn.workitem.id.x()
957 %tid.ext = sext i32 %tid to i64
; The input is indexed as i16 and the output as i32, so the two addresses use
; different element strides for the same work-item index.
958 %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 %tid.ext
959 %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
960 %x = load i16, ptr addrspace(1) %gep
961 %result = sub i16 %x, 64
; The zext is applied after the 16-bit sub, so the upper 16 bits of the stored
; i32 are zero.
962 %zext = zext i16 %result to i32
963 store i32 %zext, ptr addrspace(1) %gep.out
; Two volatile i16 loads from the same %in element each have 64 subtracted,
; and both differences are written to the same %out element with volatile
; stores, so the constant 64 feeds two separate sub instructions.
967 define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
968 ; SI-LABEL: v_test_i16_x_sub_64_multi_use:
970 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
971 ; SI-NEXT: s_mov_b32 s7, 0xf000
972 ; SI-NEXT: s_mov_b32 s6, 0
973 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
974 ; SI-NEXT: v_mov_b32_e32 v1, 0
975 ; SI-NEXT: s_waitcnt lgkmcnt(0)
976 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
977 ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
978 ; SI-NEXT: s_waitcnt vmcnt(0)
979 ; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
980 ; SI-NEXT: s_waitcnt vmcnt(0)
981 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
982 ; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
983 ; SI-NEXT: v_subrev_i32_e32 v3, vcc, 64, v3
984 ; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
985 ; SI-NEXT: s_waitcnt vmcnt(0)
986 ; SI-NEXT: buffer_store_short v3, v[0:1], s[0:3], 0 addr64
987 ; SI-NEXT: s_waitcnt vmcnt(0)
990 ; VI-LABEL: v_test_i16_x_sub_64_multi_use:
992 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
993 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
994 ; VI-NEXT: s_waitcnt lgkmcnt(0)
995 ; VI-NEXT: v_mov_b32_e32 v1, s3
996 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
997 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
998 ; VI-NEXT: flat_load_ushort v3, v[0:1] glc
999 ; VI-NEXT: s_waitcnt vmcnt(0)
1000 ; VI-NEXT: flat_load_ushort v4, v[0:1] glc
1001 ; VI-NEXT: s_waitcnt vmcnt(0)
1002 ; VI-NEXT: v_mov_b32_e32 v1, s1
1003 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1004 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1005 ; VI-NEXT: v_subrev_u16_e32 v2, 64, v3
1006 ; VI-NEXT: v_subrev_u16_e32 v3, 64, v4
1007 ; VI-NEXT: flat_store_short v[0:1], v2
1008 ; VI-NEXT: s_waitcnt vmcnt(0)
1009 ; VI-NEXT: flat_store_short v[0:1], v3
1010 ; VI-NEXT: s_waitcnt vmcnt(0)
1013 ; GFX9-LABEL: v_test_i16_x_sub_64_multi_use:
1015 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1016 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1017 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1018 ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
1019 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1020 ; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] glc
1021 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1022 ; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1
1023 ; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v2
1024 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
1025 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1026 ; GFX9-NEXT: global_store_short v0, v2, s[0:1]
1027 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1028 ; GFX9-NEXT: s_endpgm
1030 ; GFX10-LABEL: v_test_i16_x_sub_64_multi_use:
1032 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1033 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1034 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1035 ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
1036 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1037 ; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
1038 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1039 ; GFX10-NEXT: v_sub_nc_u16 v1, v1, 64
1040 ; GFX10-NEXT: v_sub_nc_u16 v2, v2, 64
1041 ; GFX10-NEXT: global_store_short v0, v1, s[0:1]
1042 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1043 ; GFX10-NEXT: global_store_short v0, v2, s[0:1]
1044 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1045 ; GFX10-NEXT: s_endpgm
1047 ; GFX11-LABEL: v_test_i16_x_sub_64_multi_use:
1049 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1050 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1051 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1052 ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
1053 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1054 ; GFX11-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc
1055 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1056 ; GFX11-NEXT: v_sub_nc_u16 v1, v1, 64
1057 ; GFX11-NEXT: v_sub_nc_u16 v2, v2, 64
1058 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] dlc
1059 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1060 ; GFX11-NEXT: global_store_b16 v0, v2, s[0:1] dlc
1061 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1062 ; GFX11-NEXT: s_nop 0
1063 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1064 ; GFX11-NEXT: s_endpgm
1065 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1066 %tid.ext = sext i32 %tid to i64
1067 %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 %tid.ext
1068 %gep.out = getelementptr inbounds i16, ptr addrspace(1) %out, i64 %tid.ext
; The loads are volatile, so both must be performed even though they read the
; same address.
1069 %x = load volatile i16, ptr addrspace(1) %gep
1070 %y = load volatile i16, ptr addrspace(1) %gep
1071 %result0 = sub i16 %x, 64
1072 %result1 = sub i16 %y, 64
; Likewise both volatile stores must be kept although they target the same
; element.
1073 store volatile i16 %result0, ptr addrspace(1) %gep.out
1074 store volatile i16 %result1, ptr addrspace(1) %gep.out
; Packed variant. Each work-item loads a <2 x i16> from %in, subtracts 64 from
; both elements (the same constant in each half) and stores the <2 x i16>
; result to %out.
1078 define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1079 ; SI-LABEL: v_test_v2i16_x_sub_64_64:
1081 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1082 ; SI-NEXT: s_mov_b32 s7, 0xf000
1083 ; SI-NEXT: s_mov_b32 s6, 0
1084 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1085 ; SI-NEXT: v_mov_b32_e32 v1, 0
1086 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1087 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1088 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1089 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1090 ; SI-NEXT: s_waitcnt vmcnt(0)
1091 ; SI-NEXT: v_subrev_i32_e32 v3, vcc, 64, v2
1092 ; SI-NEXT: s_mov_b32 s4, 0xffff0000
1093 ; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
1094 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffc00000, v2
1095 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1098 ; VI-LABEL: v_test_v2i16_x_sub_64_64:
1100 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1101 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1102 ; VI-NEXT: v_mov_b32_e32 v4, 64
1103 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1104 ; VI-NEXT: v_mov_b32_e32 v1, s3
1105 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1106 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1107 ; VI-NEXT: flat_load_dword v3, v[0:1]
1108 ; VI-NEXT: v_mov_b32_e32 v1, s1
1109 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1110 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1111 ; VI-NEXT: s_waitcnt vmcnt(0)
1112 ; VI-NEXT: v_sub_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1113 ; VI-NEXT: v_subrev_u16_e32 v3, 64, v3
1114 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
1115 ; VI-NEXT: flat_store_dword v[0:1], v2
1118 ; GFX9-LABEL: v_test_v2i16_x_sub_64_64:
1120 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1121 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1122 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1123 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1124 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1125 ; GFX9-NEXT: v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0]
1126 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1127 ; GFX9-NEXT: s_endpgm
1129 ; GFX10-LABEL: v_test_v2i16_x_sub_64_64:
1131 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1132 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1133 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1134 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
1135 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1136 ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0]
1137 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1138 ; GFX10-NEXT: s_endpgm
1140 ; GFX11-LABEL: v_test_v2i16_x_sub_64_64:
1142 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1143 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1144 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1145 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1146 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1147 ; GFX11-NEXT: v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0]
1148 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1149 ; GFX11-NEXT: s_nop 0
1150 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1151 ; GFX11-NEXT: s_endpgm
1152 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1153 %tid.ext = sext i32 %tid to i64
1154 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
1155 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
1156 %x = load <2 x i16>, ptr addrspace(1) %gep
; Same constant in both vector elements.
1157 %result = sub <2 x i16> %x, <i16 64, i16 64>
1158 store <2 x i16> %result, ptr addrspace(1) %gep.out
; Packed variant with a different constant per element. Subtracts 7 from
; element 0 and 64 from element 1 of a loaded <2 x i16>. The packed-sub checks
; below carry the two constants as the single 32-bit literal 0x400007.
1162 define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1163 ; SI-LABEL: v_test_v2i16_x_sub_7_64:
1165 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1166 ; SI-NEXT: s_mov_b32 s7, 0xf000
1167 ; SI-NEXT: s_mov_b32 s6, 0
1168 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1169 ; SI-NEXT: v_mov_b32_e32 v1, 0
1170 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1171 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1172 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1173 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1174 ; SI-NEXT: s_waitcnt vmcnt(0)
1175 ; SI-NEXT: v_add_i32_e32 v3, vcc, -7, v2
1176 ; SI-NEXT: s_mov_b32 s4, 0xffff0000
1177 ; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
1178 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffc00000, v2
1179 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1182 ; VI-LABEL: v_test_v2i16_x_sub_7_64:
1184 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1185 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1186 ; VI-NEXT: v_mov_b32_e32 v4, 64
1187 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1188 ; VI-NEXT: v_mov_b32_e32 v1, s3
1189 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1190 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1191 ; VI-NEXT: flat_load_dword v3, v[0:1]
1192 ; VI-NEXT: v_mov_b32_e32 v1, s1
1193 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1194 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1195 ; VI-NEXT: s_waitcnt vmcnt(0)
1196 ; VI-NEXT: v_add_u16_e32 v2, -7, v3
1197 ; VI-NEXT: v_sub_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1198 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
1199 ; VI-NEXT: flat_store_dword v[0:1], v2
1202 ; GFX9-LABEL: v_test_v2i16_x_sub_7_64:
1204 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1205 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1206 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1207 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1208 ; GFX9-NEXT: s_mov_b32 s2, 0x400007
1209 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1210 ; GFX9-NEXT: v_pk_sub_i16 v1, v1, s2
1211 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1212 ; GFX9-NEXT: s_endpgm
1214 ; GFX10-LABEL: v_test_v2i16_x_sub_7_64:
1216 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1217 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1218 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1219 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
1220 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1221 ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x400007
1222 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1223 ; GFX10-NEXT: s_endpgm
1225 ; GFX11-LABEL: v_test_v2i16_x_sub_7_64:
1227 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1228 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1229 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1230 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1231 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1232 ; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0x400007
1233 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1234 ; GFX11-NEXT: s_nop 0
1235 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1236 ; GFX11-NEXT: s_endpgm
1237 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1238 %tid.ext = sext i32 %tid to i64
1239 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
1240 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
1241 %x = load <2 x i16>, ptr addrspace(1) %gep
; Element 0 uses 7 and element 1 uses 64.
1242 %result = sub <2 x i16> %x, <i16 7, i16 64>
1243 store <2 x i16> %result, ptr addrspace(1) %gep.out
; Packed variant subtracting 64 from element 0 and 123 from element 1 of a
; loaded <2 x i16>. The packed-sub checks below carry the two constants as the
; single 32-bit literal 0x7b0040 (0x7b is 123, 0x40 is 64).
1247 define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1248 ; SI-LABEL: v_test_v2i16_x_sub_64_123:
1250 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1251 ; SI-NEXT: s_mov_b32 s7, 0xf000
1252 ; SI-NEXT: s_mov_b32 s6, 0
1253 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1254 ; SI-NEXT: v_mov_b32_e32 v1, 0
1255 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1256 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1257 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1258 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1259 ; SI-NEXT: s_waitcnt vmcnt(0)
1260 ; SI-NEXT: v_subrev_i32_e32 v3, vcc, 64, v2
1261 ; SI-NEXT: s_mov_b32 s4, 0xffff0000
1262 ; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
1263 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xff850000, v2
1264 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1267 ; VI-LABEL: v_test_v2i16_x_sub_64_123:
1269 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1270 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1271 ; VI-NEXT: v_mov_b32_e32 v4, 0xffffff85
1272 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1273 ; VI-NEXT: v_mov_b32_e32 v1, s3
1274 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1275 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1276 ; VI-NEXT: flat_load_dword v3, v[0:1]
1277 ; VI-NEXT: v_mov_b32_e32 v1, s1
1278 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1279 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1280 ; VI-NEXT: s_waitcnt vmcnt(0)
1281 ; VI-NEXT: v_add_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1282 ; VI-NEXT: v_subrev_u16_e32 v3, 64, v3
1283 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
1284 ; VI-NEXT: flat_store_dword v[0:1], v2
1287 ; GFX9-LABEL: v_test_v2i16_x_sub_64_123:
1289 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1290 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1291 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1292 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1293 ; GFX9-NEXT: s_mov_b32 s2, 0x7b0040
1294 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1295 ; GFX9-NEXT: v_pk_sub_i16 v1, v1, s2
1296 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1297 ; GFX9-NEXT: s_endpgm
1299 ; GFX10-LABEL: v_test_v2i16_x_sub_64_123:
1301 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1302 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1303 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1304 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
1305 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1306 ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x7b0040
1307 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1308 ; GFX10-NEXT: s_endpgm
1310 ; GFX11-LABEL: v_test_v2i16_x_sub_64_123:
1312 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1313 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1314 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1315 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1316 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1317 ; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0x7b0040
1318 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1319 ; GFX11-NEXT: s_nop 0
1320 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1321 ; GFX11-NEXT: s_endpgm
1322 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1323 %tid.ext = sext i32 %tid to i64
1324 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
1325 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
1326 %x = load <2 x i16>, ptr addrspace(1) %gep
; Element 0 uses 64 and element 1 uses 123.
1327 %result = sub <2 x i16> %x, <i16 64, i16 123>
1328 store <2 x i16> %result, ptr addrspace(1) %gep.out
1332 ; Can fold 0 and inline immediate in other half.
; Here only element 0 of the loaded <2 x i16> is changed (7 is subtracted);
; element 1 has 0 subtracted. The packed-sub checks below use the plain
; operand 7 with no op_sel modifiers.
1333 define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1334 ; SI-LABEL: v_test_v2i16_x_sub_7_0:
1336 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1337 ; SI-NEXT: s_mov_b32 s7, 0xf000
1338 ; SI-NEXT: s_mov_b32 s6, 0
1339 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1340 ; SI-NEXT: v_mov_b32_e32 v1, 0
1341 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1342 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1343 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1344 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1345 ; SI-NEXT: s_waitcnt vmcnt(0)
1346 ; SI-NEXT: v_add_i32_e32 v3, vcc, -7, v2
1347 ; SI-NEXT: s_mov_b32 s4, 0xffff
1348 ; SI-NEXT: v_bfi_b32 v2, s4, v3, v2
1349 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1352 ; VI-LABEL: v_test_v2i16_x_sub_7_0:
1354 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1355 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1356 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1357 ; VI-NEXT: v_mov_b32_e32 v1, s3
1358 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1359 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1360 ; VI-NEXT: flat_load_dword v3, v[0:1]
1361 ; VI-NEXT: v_mov_b32_e32 v1, s1
1362 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1363 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1364 ; VI-NEXT: s_waitcnt vmcnt(0)
1365 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
1366 ; VI-NEXT: v_add_u16_e32 v3, -7, v3
1367 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
1368 ; VI-NEXT: flat_store_dword v[0:1], v2
1371 ; GFX9-LABEL: v_test_v2i16_x_sub_7_0:
1373 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1374 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1375 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1376 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1377 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1378 ; GFX9-NEXT: v_pk_sub_i16 v1, v1, 7
1379 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1380 ; GFX9-NEXT: s_endpgm
1382 ; GFX10-LABEL: v_test_v2i16_x_sub_7_0:
1384 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1385 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1386 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1387 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
1388 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1389 ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 7
1390 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1391 ; GFX10-NEXT: s_endpgm
1393 ; GFX11-LABEL: v_test_v2i16_x_sub_7_0:
1395 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1396 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1397 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1398 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1399 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1400 ; GFX11-NEXT: v_pk_sub_i16 v1, v1, 7
1401 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1402 ; GFX11-NEXT: s_nop 0
1403 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1404 ; GFX11-NEXT: s_endpgm
1405 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1406 %tid.ext = sext i32 %tid to i64
1407 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
1408 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
1409 %x = load <2 x i16>, ptr addrspace(1) %gep
; Element 1 is subtracted by 0, i.e. left unchanged.
1410 %result = sub <2 x i16> %x, <i16 7, i16 0>
1411 store <2 x i16> %result, ptr addrspace(1) %gep.out
1415 ; Can fold 0 and inline immediate in other half.
; Mirror of the 7_0 case, with the non-zero constant in element 1. Element 0
; of the loaded <2 x i16> has 0 subtracted and element 1 has 16 subtracted.
; The packed-sub checks below keep the operand 16 and route it to the high
; half with op_sel / op_sel_hi.
1416 define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1417 ; SI-LABEL: v_test_v2i16_x_sub_0_16:
1419 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1420 ; SI-NEXT: s_mov_b32 s7, 0xf000
1421 ; SI-NEXT: s_mov_b32 s6, 0
1422 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1423 ; SI-NEXT: v_mov_b32_e32 v1, 0
1424 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1425 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1426 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1427 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1428 ; SI-NEXT: s_waitcnt vmcnt(0)
1429 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xfff00000, v2
1430 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1433 ; VI-LABEL: v_test_v2i16_x_sub_0_16:
1435 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1436 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1437 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1438 ; VI-NEXT: v_mov_b32_e32 v1, s3
1439 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1440 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1441 ; VI-NEXT: flat_load_dword v3, v[0:1]
1442 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1443 ; VI-NEXT: v_mov_b32_e32 v2, -16
1444 ; VI-NEXT: v_mov_b32_e32 v1, s1
1445 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1446 ; VI-NEXT: s_waitcnt vmcnt(0)
1447 ; VI-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1448 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1449 ; VI-NEXT: flat_store_dword v[0:1], v2
1452 ; GFX9-LABEL: v_test_v2i16_x_sub_0_16:
1454 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1455 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1456 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1457 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1458 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1459 ; GFX9-NEXT: v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
1460 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1461 ; GFX9-NEXT: s_endpgm
1463 ; GFX10-LABEL: v_test_v2i16_x_sub_0_16:
1465 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1466 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1467 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1468 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
1469 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1470 ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
1471 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1472 ; GFX10-NEXT: s_endpgm
1474 ; GFX11-LABEL: v_test_v2i16_x_sub_0_16:
1476 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1477 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1478 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1479 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1480 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1481 ; GFX11-NEXT: v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
1482 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1483 ; GFX11-NEXT: s_nop 0
1484 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1485 ; GFX11-NEXT: s_endpgm
1486 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1487 %tid.ext = sext i32 %tid to i64
1488 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
1489 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
1490 %x = load <2 x i16>, ptr addrspace(1) %gep
; Element 0 is subtracted by 0, i.e. left unchanged.
1491 %result = sub <2 x i16> %x, <i16 0, i16 16>
1492 store <2 x i16> %result, ptr addrspace(1) %gep.out
; Element 0 of the loaded <2 x i16> has 0 subtracted; element 1 has the
; constant -15360 subtracted. As a 16-bit pattern -15360 is 0xc400, and its
; negation is 0x3c00; the checks below show 0xc400 in the sub forms and
; 0x3c00 in the add forms.
; NOTE(review) 0x3c00 is the bit pattern of half-precision 1.0, which is
; presumably what the _1_0 suffix in the name refers to -- confirm.
1496 define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1497 ; SI-LABEL: v_test_v2i16_x_sub_0_1_0:
1499 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1500 ; SI-NEXT: s_mov_b32 s7, 0xf000
1501 ; SI-NEXT: s_mov_b32 s6, 0
1502 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1503 ; SI-NEXT: v_mov_b32_e32 v1, 0
1504 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1505 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1506 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1507 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1508 ; SI-NEXT: s_waitcnt vmcnt(0)
1509 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3c000000, v2
1510 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1513 ; VI-LABEL: v_test_v2i16_x_sub_0_1_0:
1515 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1516 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1517 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1518 ; VI-NEXT: v_mov_b32_e32 v1, s3
1519 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1520 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1521 ; VI-NEXT: flat_load_dword v3, v[0:1]
1522 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1523 ; VI-NEXT: v_mov_b32_e32 v2, 0x3c00
1524 ; VI-NEXT: v_mov_b32_e32 v1, s1
1525 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1526 ; VI-NEXT: s_waitcnt vmcnt(0)
1527 ; VI-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1528 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1529 ; VI-NEXT: flat_store_dword v[0:1], v2
1532 ; GFX9-LABEL: v_test_v2i16_x_sub_0_1_0:
1534 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1535 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1536 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1537 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1538 ; GFX9-NEXT: s_brev_b32 s2, 35
1539 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1540 ; GFX9-NEXT: v_pk_sub_i16 v1, v1, s2
1541 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1542 ; GFX9-NEXT: s_endpgm
1544 ; GFX10-LABEL: v_test_v2i16_x_sub_0_1_0:
1546 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1547 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1548 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1549 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
1550 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1551 ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0xc400 op_sel:[0,1] op_sel_hi:[1,0]
1552 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1553 ; GFX10-NEXT: s_endpgm
1555 ; GFX11-LABEL: v_test_v2i16_x_sub_0_1_0:
1557 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1558 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1559 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1560 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1561 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1562 ; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0xc400 op_sel:[0,1] op_sel_hi:[1,0]
1563 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1564 ; GFX11-NEXT: s_nop 0
1565 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1566 ; GFX11-NEXT: s_endpgm
1567 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1568 %tid.ext = sext i32 %tid to i64
1569 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
1570 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
1571 %x = load <2 x i16>, ptr addrspace(1) %gep
; i16 -15360 has the bit pattern 0xc400.
1572 %result = sub <2 x i16> %x, <i16 0, i16 -15360>
1573 store <2 x i16> %result, ptr addrspace(1) %gep.out
; Element 0 of the loaded <2 x i16> has 0 subtracted; element 1 has the
; constant 17408 subtracted. 17408 is 0x4400, and its 16-bit negation is
; 0xbc00; the checks below show 0x4400 in the sub forms and 0xbc00 in the add
; forms.
; NOTE(review) 0xbc00 is the bit pattern of half-precision -1.0, which is
; presumably what the _neg1_0 suffix in the name refers to -- confirm.
1577 define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1578 ; SI-LABEL: v_test_v2i16_x_sub_0_neg1_0:
1580 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1581 ; SI-NEXT: s_mov_b32 s7, 0xf000
1582 ; SI-NEXT: s_mov_b32 s6, 0
1583 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1584 ; SI-NEXT: v_mov_b32_e32 v1, 0
1585 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1586 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1587 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1588 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1589 ; SI-NEXT: s_waitcnt vmcnt(0)
1590 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xbc000000, v2
1591 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1594 ; VI-LABEL: v_test_v2i16_x_sub_0_neg1_0:
1596 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1597 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1598 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1599 ; VI-NEXT: v_mov_b32_e32 v1, s3
1600 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1601 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1602 ; VI-NEXT: flat_load_dword v3, v[0:1]
1603 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1604 ; VI-NEXT: v_mov_b32_e32 v2, 0xffffbc00
1605 ; VI-NEXT: v_mov_b32_e32 v1, s1
1606 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1607 ; VI-NEXT: s_waitcnt vmcnt(0)
1608 ; VI-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1609 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1610 ; VI-NEXT: flat_store_dword v[0:1], v2
1613 ; GFX9-LABEL: v_test_v2i16_x_sub_0_neg1_0:
1615 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1616 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1617 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1618 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1619 ; GFX9-NEXT: s_brev_b32 s2, 34
1620 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1621 ; GFX9-NEXT: v_pk_sub_i16 v1, v1, s2
1622 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1623 ; GFX9-NEXT: s_endpgm
1625 ; GFX10-LABEL: v_test_v2i16_x_sub_0_neg1_0:
1627 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1628 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1629 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1630 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
1631 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1632 ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x4400 op_sel:[0,1] op_sel_hi:[1,0]
1633 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1634 ; GFX10-NEXT: s_endpgm
1636 ; GFX11-LABEL: v_test_v2i16_x_sub_0_neg1_0:
1638 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1639 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1640 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1641 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1642 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1643 ; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0x4400 op_sel:[0,1] op_sel_hi:[1,0]
1644 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1645 ; GFX11-NEXT: s_nop 0
1646 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1647 ; GFX11-NEXT: s_endpgm
1648 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1649 %tid.ext = sext i32 %tid to i64
1650 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
1651 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
1652 %x = load <2 x i16>, ptr addrspace(1) %gep
; i16 17408 has the bit pattern 0x4400.
1653 %result = sub <2 x i16> %x, <i16 0, i16 17408>
1654 store <2 x i16> %result, ptr addrspace(1) %gep.out
1658 ; -32 isn't an inline immediate, but 32 is
; Adds the splat <-32, -32> to a <2 x i16> loaded from %in at the workitem's
; index and stores the result to the same index in %out.
; The GFX9/GFX10/GFX11 checks expect the add to be emitted as v_pk_sub_u16 with
; the positive constant 32. The VI checks expect both halves to be handled with
; 16-bit subtracts of 32. The SI checks expect a 32-bit subtract of 32 merged
; with v_bfi_b32, followed by a 32-bit add of 0xffe00000.
1659 define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1660 ; SI-LABEL: v_test_v2i16_x_add_neg32_neg32:
1662 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1663 ; SI-NEXT: s_mov_b32 s7, 0xf000
1664 ; SI-NEXT: s_mov_b32 s6, 0
1665 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1666 ; SI-NEXT: v_mov_b32_e32 v1, 0
1667 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1668 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1669 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1670 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1671 ; SI-NEXT: s_waitcnt vmcnt(0)
1672 ; SI-NEXT: v_subrev_i32_e32 v3, vcc, 32, v2
1673 ; SI-NEXT: s_mov_b32 s4, 0xffff0000
1674 ; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
1675 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffe00000, v2
1676 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1679 ; VI-LABEL: v_test_v2i16_x_add_neg32_neg32:
1681 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1682 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1683 ; VI-NEXT: v_mov_b32_e32 v4, 32
1684 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1685 ; VI-NEXT: v_mov_b32_e32 v1, s3
1686 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1687 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1688 ; VI-NEXT: flat_load_dword v3, v[0:1]
1689 ; VI-NEXT: v_mov_b32_e32 v1, s1
1690 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1691 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1692 ; VI-NEXT: s_waitcnt vmcnt(0)
1693 ; VI-NEXT: v_sub_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1694 ; VI-NEXT: v_subrev_u16_e32 v3, 32, v3
1695 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
1696 ; VI-NEXT: flat_store_dword v[0:1], v2
1699 ; GFX9-LABEL: v_test_v2i16_x_add_neg32_neg32:
1701 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1702 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1703 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1704 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1705 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1706 ; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0]
1707 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1708 ; GFX9-NEXT: s_endpgm
1710 ; GFX10-LABEL: v_test_v2i16_x_add_neg32_neg32:
1712 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1713 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1714 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1715 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
1716 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1717 ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0]
1718 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1719 ; GFX10-NEXT: s_endpgm
1721 ; GFX11-LABEL: v_test_v2i16_x_add_neg32_neg32:
1723 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1724 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1725 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1726 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1727 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1728 ; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0]
1729 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1730 ; GFX11-NEXT: s_nop 0
1731 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1732 ; GFX11-NEXT: s_endpgm
1733 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1734 %tid.ext = sext i32 %tid to i64
1735 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
1736 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
1737 %x = load <2 x i16>, ptr addrspace(1) %gep
1738 %result = add <2 x i16> %x, <i16 -32, i16 -32>
1739 store <2 x i16> %result, ptr addrspace(1) %gep.out
; Adds <0, -32> to a <2 x i16> loaded from %in at the workitem's index and
; stores the result to the same index in %out, so only the second element
; changes.
; The GFX9/GFX10/GFX11 checks expect v_pk_sub_u16 with the constant 32 and the
; op_sel:[0,1] op_sel_hi:[1,0] modifiers. The VI checks expect a single SDWA
; subtract of 32 on the high word, OR'd with the low word of the loaded value.
; The SI checks expect one 32-bit add of 0xffe00000.
1743 define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1744 ; SI-LABEL: v_test_v2i16_x_add_0_neg32:
1746 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1747 ; SI-NEXT: s_mov_b32 s7, 0xf000
1748 ; SI-NEXT: s_mov_b32 s6, 0
1749 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1750 ; SI-NEXT: v_mov_b32_e32 v1, 0
1751 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1752 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1753 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1754 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1755 ; SI-NEXT: s_waitcnt vmcnt(0)
1756 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffe00000, v2
1757 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1760 ; VI-LABEL: v_test_v2i16_x_add_0_neg32:
1762 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1763 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1764 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1765 ; VI-NEXT: v_mov_b32_e32 v1, s3
1766 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1767 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1768 ; VI-NEXT: flat_load_dword v3, v[0:1]
1769 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1770 ; VI-NEXT: v_mov_b32_e32 v2, 32
1771 ; VI-NEXT: v_mov_b32_e32 v1, s1
1772 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1773 ; VI-NEXT: s_waitcnt vmcnt(0)
1774 ; VI-NEXT: v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1775 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1776 ; VI-NEXT: flat_store_dword v[0:1], v2
1779 ; GFX9-LABEL: v_test_v2i16_x_add_0_neg32:
1781 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1782 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1783 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1784 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1785 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1786 ; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
1787 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1788 ; GFX9-NEXT: s_endpgm
1790 ; GFX10-LABEL: v_test_v2i16_x_add_0_neg32:
1792 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1793 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1794 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1795 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
1796 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1797 ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
1798 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1799 ; GFX10-NEXT: s_endpgm
1801 ; GFX11-LABEL: v_test_v2i16_x_add_0_neg32:
1803 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1804 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1805 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1806 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1807 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1808 ; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
1809 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1810 ; GFX11-NEXT: s_nop 0
1811 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1812 ; GFX11-NEXT: s_endpgm
1813 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1814 %tid.ext = sext i32 %tid to i64
1815 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
1816 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
1817 %x = load <2 x i16>, ptr addrspace(1) %gep
1818 %result = add <2 x i16> %x, <i16 0, i16 -32>
1819 store <2 x i16> %result, ptr addrspace(1) %gep.out
; Adds <-32, 0> to a <2 x i16> loaded from %in at the workitem's index and
; stores the result to the same index in %out, so only the first element
; changes.
; The GFX9/GFX10/GFX11 checks expect v_pk_sub_u16 with the constant 32 and no
; op_sel modifiers. The VI checks expect the high word to be masked out with
; 0xffff0000, a 16-bit subtract of 32 on the low word, and an OR of the two.
; The SI checks expect a 32-bit subtract of 32 merged into the low 16 bits via
; v_bfi_b32 with mask 0xffff.
1823 define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1824 ; SI-LABEL: v_test_v2i16_x_add_neg32_0:
1826 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1827 ; SI-NEXT: s_mov_b32 s7, 0xf000
1828 ; SI-NEXT: s_mov_b32 s6, 0
1829 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1830 ; SI-NEXT: v_mov_b32_e32 v1, 0
1831 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1832 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1833 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1834 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1835 ; SI-NEXT: s_waitcnt vmcnt(0)
1836 ; SI-NEXT: v_subrev_i32_e32 v3, vcc, 32, v2
1837 ; SI-NEXT: s_mov_b32 s4, 0xffff
1838 ; SI-NEXT: v_bfi_b32 v2, s4, v3, v2
1839 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1842 ; VI-LABEL: v_test_v2i16_x_add_neg32_0:
1844 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1845 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1846 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1847 ; VI-NEXT: v_mov_b32_e32 v1, s3
1848 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1849 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1850 ; VI-NEXT: flat_load_dword v3, v[0:1]
1851 ; VI-NEXT: v_mov_b32_e32 v1, s1
1852 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1853 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1854 ; VI-NEXT: s_waitcnt vmcnt(0)
1855 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
1856 ; VI-NEXT: v_subrev_u16_e32 v3, 32, v3
1857 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
1858 ; VI-NEXT: flat_store_dword v[0:1], v2
1861 ; GFX9-LABEL: v_test_v2i16_x_add_neg32_0:
1863 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1864 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1865 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1866 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1867 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1868 ; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32
1869 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1870 ; GFX9-NEXT: s_endpgm
1872 ; GFX10-LABEL: v_test_v2i16_x_add_neg32_0:
1874 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1875 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1876 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1877 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
1878 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1879 ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32
1880 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1881 ; GFX10-NEXT: s_endpgm
1883 ; GFX11-LABEL: v_test_v2i16_x_add_neg32_0:
1885 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1886 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1887 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1888 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1889 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1890 ; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32
1891 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1892 ; GFX11-NEXT: s_nop 0
1893 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1894 ; GFX11-NEXT: s_endpgm
1895 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1896 %tid.ext = sext i32 %tid to i64
1897 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
1898 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
1899 %x = load <2 x i16>, ptr addrspace(1) %gep
1900 %result = add <2 x i16> %x, <i16 -32, i16 0>
1901 store <2 x i16> %result, ptr addrspace(1) %gep.out
1905 ; 16 and -16 are both inline immediates
; Adds the splat <-16, -16> to a <2 x i16> loaded from %in at the workitem's
; index and stores the result to the same index in %out.
; The SI and VI checks keep the operation as adds of -16 (plus, on SI, a 32-bit
; add of 0xfff00000 for the high half). The GFX9/GFX10/GFX11 checks expect
; v_pk_sub_u16 with the positive constant 16.
1906 define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1907 ; SI-LABEL: v_test_v2i16_x_add_neg16_neg16:
1909 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1910 ; SI-NEXT: s_mov_b32 s7, 0xf000
1911 ; SI-NEXT: s_mov_b32 s6, 0
1912 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1913 ; SI-NEXT: v_mov_b32_e32 v1, 0
1914 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1915 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1916 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1917 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1918 ; SI-NEXT: s_waitcnt vmcnt(0)
1919 ; SI-NEXT: v_add_i32_e32 v3, vcc, -16, v2
1920 ; SI-NEXT: s_mov_b32 s4, 0xffff0000
1921 ; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
1922 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xfff00000, v2
1923 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1926 ; VI-LABEL: v_test_v2i16_x_add_neg16_neg16:
1928 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1929 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1930 ; VI-NEXT: v_mov_b32_e32 v4, -16
1931 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1932 ; VI-NEXT: v_mov_b32_e32 v1, s3
1933 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1934 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1935 ; VI-NEXT: flat_load_dword v3, v[0:1]
1936 ; VI-NEXT: v_mov_b32_e32 v1, s1
1937 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1938 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1939 ; VI-NEXT: s_waitcnt vmcnt(0)
1940 ; VI-NEXT: v_add_u16_e32 v2, -16, v3
1941 ; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1942 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
1943 ; VI-NEXT: flat_store_dword v[0:1], v2
1946 ; GFX9-LABEL: v_test_v2i16_x_add_neg16_neg16:
1948 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1949 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1950 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1951 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1952 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1953 ; GFX9-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel_hi:[1,0]
1954 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1955 ; GFX9-NEXT: s_endpgm
1957 ; GFX10-LABEL: v_test_v2i16_x_add_neg16_neg16:
1959 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1960 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1961 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1962 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
1963 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1964 ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel_hi:[1,0]
1965 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1966 ; GFX10-NEXT: s_endpgm
1968 ; GFX11-LABEL: v_test_v2i16_x_add_neg16_neg16:
1970 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1971 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1972 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1973 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1974 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1975 ; GFX11-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel_hi:[1,0]
1976 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1977 ; GFX11-NEXT: s_nop 0
1978 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1979 ; GFX11-NEXT: s_endpgm
1980 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1981 %tid.ext = sext i32 %tid to i64
1982 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
1983 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
1984 %x = load <2 x i16>, ptr addrspace(1) %gep
1985 %result = add <2 x i16> %x, <i16 -16, i16 -16>
1986 store <2 x i16> %result, ptr addrspace(1) %gep.out
; Adds <0, -16> to a <2 x i16> loaded from %in at the workitem's index and
; stores the result to the same index in %out, so only the second element
; changes.
; The SI checks expect one 32-bit add of 0xfff00000. The VI checks expect an
; SDWA add of -16 on the high word, OR'd with the low word of the loaded value.
; The GFX9/GFX10/GFX11 checks expect v_pk_sub_u16 with the constant 16 and the
; op_sel:[0,1] op_sel_hi:[1,0] modifiers.
1990 define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1991 ; SI-LABEL: v_test_v2i16_x_add_0_neg16:
1993 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1994 ; SI-NEXT: s_mov_b32 s7, 0xf000
1995 ; SI-NEXT: s_mov_b32 s6, 0
1996 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1997 ; SI-NEXT: v_mov_b32_e32 v1, 0
1998 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1999 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
2000 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2001 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
2002 ; SI-NEXT: s_waitcnt vmcnt(0)
2003 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xfff00000, v2
2004 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2007 ; VI-LABEL: v_test_v2i16_x_add_0_neg16:
2009 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2010 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2011 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2012 ; VI-NEXT: v_mov_b32_e32 v1, s3
2013 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
2014 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2015 ; VI-NEXT: flat_load_dword v3, v[0:1]
2016 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
2017 ; VI-NEXT: v_mov_b32_e32 v2, -16
2018 ; VI-NEXT: v_mov_b32_e32 v1, s1
2019 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2020 ; VI-NEXT: s_waitcnt vmcnt(0)
2021 ; VI-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2022 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2023 ; VI-NEXT: flat_store_dword v[0:1], v2
2026 ; GFX9-LABEL: v_test_v2i16_x_add_0_neg16:
2028 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2029 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2030 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2031 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
2032 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2033 ; GFX9-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
2034 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2035 ; GFX9-NEXT: s_endpgm
2037 ; GFX10-LABEL: v_test_v2i16_x_add_0_neg16:
2039 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2040 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2041 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2042 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
2043 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2044 ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
2045 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
2046 ; GFX10-NEXT: s_endpgm
2048 ; GFX11-LABEL: v_test_v2i16_x_add_0_neg16:
2050 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
2051 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2052 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2053 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
2054 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2055 ; GFX11-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
2056 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2057 ; GFX11-NEXT: s_nop 0
2058 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2059 ; GFX11-NEXT: s_endpgm
2060 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2061 %tid.ext = sext i32 %tid to i64
2062 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
2063 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
2064 %x = load <2 x i16>, ptr addrspace(1) %gep
2065 %result = add <2 x i16> %x, <i16 0, i16 -16>
2066 store <2 x i16> %result, ptr addrspace(1) %gep.out
; Adds <-16, 0> to a <2 x i16> loaded from %in at the workitem's index and
; stores the result to the same index in %out, so only the first element
; changes.
; The SI checks expect a 32-bit add of -16 merged into the low 16 bits via
; v_bfi_b32 with mask 0xffff. The VI checks expect the high word to be masked
; out with 0xffff0000, a 16-bit add of -16 on the low word, and an OR of the
; two. The GFX9/GFX10/GFX11 checks expect v_pk_sub_u16 with the constant 16 and
; no op_sel modifiers.
2070 define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2071 ; SI-LABEL: v_test_v2i16_x_add_neg16_0:
2073 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
2074 ; SI-NEXT: s_mov_b32 s7, 0xf000
2075 ; SI-NEXT: s_mov_b32 s6, 0
2076 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2077 ; SI-NEXT: v_mov_b32_e32 v1, 0
2078 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2079 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
2080 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2081 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
2082 ; SI-NEXT: s_waitcnt vmcnt(0)
2083 ; SI-NEXT: v_add_i32_e32 v3, vcc, -16, v2
2084 ; SI-NEXT: s_mov_b32 s4, 0xffff
2085 ; SI-NEXT: v_bfi_b32 v2, s4, v3, v2
2086 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2089 ; VI-LABEL: v_test_v2i16_x_add_neg16_0:
2091 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2092 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2093 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2094 ; VI-NEXT: v_mov_b32_e32 v1, s3
2095 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
2096 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2097 ; VI-NEXT: flat_load_dword v3, v[0:1]
2098 ; VI-NEXT: v_mov_b32_e32 v1, s1
2099 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
2100 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2101 ; VI-NEXT: s_waitcnt vmcnt(0)
2102 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
2103 ; VI-NEXT: v_add_u16_e32 v3, -16, v3
2104 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
2105 ; VI-NEXT: flat_store_dword v[0:1], v2
2108 ; GFX9-LABEL: v_test_v2i16_x_add_neg16_0:
2110 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2111 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2112 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2113 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
2114 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2115 ; GFX9-NEXT: v_pk_sub_u16 v1, v1, 16
2116 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2117 ; GFX9-NEXT: s_endpgm
2119 ; GFX10-LABEL: v_test_v2i16_x_add_neg16_0:
2121 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2122 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2123 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2124 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
2125 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2126 ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 16
2127 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
2128 ; GFX10-NEXT: s_endpgm
2130 ; GFX11-LABEL: v_test_v2i16_x_add_neg16_0:
2132 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
2133 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2134 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2135 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
2136 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2137 ; GFX11-NEXT: v_pk_sub_u16 v1, v1, 16
2138 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2139 ; GFX11-NEXT: s_nop 0
2140 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2141 ; GFX11-NEXT: s_endpgm
2142 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2143 %tid.ext = sext i32 %tid to i64
2144 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
2145 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
2146 %x = load <2 x i16>, ptr addrspace(1) %gep
2147 %result = add <2 x i16> %x, <i16 -16, i16 0>
2148 store <2 x i16> %result, ptr addrspace(1) %gep.out
; Adds the splat <-15360, -15360> (0xc400 per element) to a <2 x i16> loaded
; from %in at the workitem's index and stores the result to the same index in
; %out.
; The negated constant is 0x3c00, which is the half-precision bit pattern of
; 1.0 (presumably what "fpone" in the test name refers to). The SI and VI
; checks keep adds of the original constant. The GFX9 checks expect
; 0x3c003c00 to be materialized in an SGPR and subtracted with v_pk_sub_u16;
; the GFX10/GFX11 checks expect v_pk_sub_u16 with the literal 0x3c00.
2152 define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2153 ; SI-LABEL: v_test_v2i16_x_add_neg_fpone:
2155 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
2156 ; SI-NEXT: s_mov_b32 s7, 0xf000
2157 ; SI-NEXT: s_mov_b32 s6, 0
2158 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2159 ; SI-NEXT: v_mov_b32_e32 v1, 0
2160 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2161 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
2162 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2163 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
2164 ; SI-NEXT: s_waitcnt vmcnt(0)
2165 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0xffffc400, v2
2166 ; SI-NEXT: s_mov_b32 s4, 0xffff0000
2167 ; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
2168 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xc4000000, v2
2169 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2172 ; VI-LABEL: v_test_v2i16_x_add_neg_fpone:
2174 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2175 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2176 ; VI-NEXT: v_mov_b32_e32 v4, 0xffffc400
2177 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2178 ; VI-NEXT: v_mov_b32_e32 v1, s3
2179 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
2180 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2181 ; VI-NEXT: flat_load_dword v3, v[0:1]
2182 ; VI-NEXT: v_mov_b32_e32 v1, s1
2183 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
2184 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2185 ; VI-NEXT: s_waitcnt vmcnt(0)
2186 ; VI-NEXT: v_add_u16_e32 v2, 0xc400, v3
2187 ; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2188 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
2189 ; VI-NEXT: flat_store_dword v[0:1], v2
2192 ; GFX9-LABEL: v_test_v2i16_x_add_neg_fpone:
2194 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2195 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2196 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2197 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
2198 ; GFX9-NEXT: s_mov_b32 s2, 0x3c003c00
2199 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2200 ; GFX9-NEXT: v_pk_sub_u16 v1, v1, s2
2201 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2202 ; GFX9-NEXT: s_endpgm
2204 ; GFX10-LABEL: v_test_v2i16_x_add_neg_fpone:
2206 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2207 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2208 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2209 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
2210 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2211 ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 0x3c00 op_sel_hi:[1,0]
2212 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
2213 ; GFX10-NEXT: s_endpgm
2215 ; GFX11-LABEL: v_test_v2i16_x_add_neg_fpone:
2217 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
2218 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2219 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2220 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
2221 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2222 ; GFX11-NEXT: v_pk_sub_u16 v1, v1, 0x3c00 op_sel_hi:[1,0]
2223 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2224 ; GFX11-NEXT: s_nop 0
2225 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2226 ; GFX11-NEXT: s_endpgm
2227 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2228 %tid.ext = sext i32 %tid to i64
2229 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
2230 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
2231 %x = load <2 x i16>, ptr addrspace(1) %gep
2232 %result = add <2 x i16> %x, <i16 -15360, i16 -15360>
2233 store <2 x i16> %result, ptr addrspace(1) %gep.out
; Adds the splat <17408, 17408> (0x4400 per element) to a <2 x i16> loaded from
; %in at the workitem's index and stores the result to the same index in %out.
; The negated constant is 0xbc00, which is the half-precision bit pattern of
; -1.0 (presumably what "negfpone" in the test name refers to). The SI and VI
; checks keep adds of the original constant. The GFX9 checks expect
; 0xbc00bc00 to be materialized in an SGPR and subtracted with v_pk_sub_u16;
; the GFX10/GFX11 checks expect v_pk_sub_u16 with the literal 0xbc00.
2237 define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2238 ; SI-LABEL: v_test_v2i16_x_add_neg_negfpone:
2240 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
2241 ; SI-NEXT: s_mov_b32 s7, 0xf000
2242 ; SI-NEXT: s_mov_b32 s6, 0
2243 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2244 ; SI-NEXT: v_mov_b32_e32 v1, 0
2245 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2246 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
2247 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2248 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
2249 ; SI-NEXT: s_waitcnt vmcnt(0)
2250 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4400, v2
2251 ; SI-NEXT: s_mov_b32 s4, 0xffff0000
2252 ; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
2253 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44000000, v2
2254 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2257 ; VI-LABEL: v_test_v2i16_x_add_neg_negfpone:
2259 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2260 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2261 ; VI-NEXT: v_mov_b32_e32 v4, 0x4400
2262 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2263 ; VI-NEXT: v_mov_b32_e32 v1, s3
2264 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
2265 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2266 ; VI-NEXT: flat_load_dword v3, v[0:1]
2267 ; VI-NEXT: v_mov_b32_e32 v1, s1
2268 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
2269 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2270 ; VI-NEXT: s_waitcnt vmcnt(0)
2271 ; VI-NEXT: v_add_u16_e32 v2, 0x4400, v3
2272 ; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2273 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
2274 ; VI-NEXT: flat_store_dword v[0:1], v2
2277 ; GFX9-LABEL: v_test_v2i16_x_add_neg_negfpone:
2279 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2280 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2281 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2282 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
2283 ; GFX9-NEXT: s_mov_b32 s2, 0xbc00bc00
2284 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2285 ; GFX9-NEXT: v_pk_sub_u16 v1, v1, s2
2286 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2287 ; GFX9-NEXT: s_endpgm
2289 ; GFX10-LABEL: v_test_v2i16_x_add_neg_negfpone:
2291 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2292 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2293 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2294 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
2295 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2296 ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 0xbc00 op_sel_hi:[1,0]
2297 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
2298 ; GFX10-NEXT: s_endpgm
2300 ; GFX11-LABEL: v_test_v2i16_x_add_neg_negfpone:
2302 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
2303 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2304 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2305 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
2306 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2307 ; GFX11-NEXT: v_pk_sub_u16 v1, v1, 0xbc00 op_sel_hi:[1,0]
2308 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2309 ; GFX11-NEXT: s_nop 0
2310 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2311 ; GFX11-NEXT: s_endpgm
2312 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2313 %tid.ext = sext i32 %tid to i64
2314 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
2315 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
2316 %x = load <2 x i16>, ptr addrspace(1) %gep
2317 %result = add <2 x i16> %x, <i16 17408, i16 17408>
2318 store <2 x i16> %result, ptr addrspace(1) %gep.out
; Adds the splat <16384, 16384> (0x4000 per element) to a <2 x i16> loaded from
; %in at the workitem's index and stores the result to the same index in %out.
; The SI and VI checks keep adds of 0x4000 for the 16-bit halves; on SI the
; high-half add is printed with the inline operand 2.0. The GFX9 checks expect
; 0xc000c000 to be materialized in an SGPR and subtracted with v_pk_sub_u16;
; the GFX10/GFX11 checks expect v_pk_sub_u16 with the literal 0xc000.
; NOTE(review) - 0x4000 is the half-precision bit pattern of 2.0 and the
; subtracted constant 0xc000 is that of -2.0, so by analogy with the fpone
; tests the name of this test looks swapped with
; v_test_v2i16_x_add_neg_negfptwo. Confirm whether that is intentional.
2322 define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2323 ; SI-LABEL: v_test_v2i16_x_add_neg_fptwo:
2325 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
2326 ; SI-NEXT: s_mov_b32 s7, 0xf000
2327 ; SI-NEXT: s_mov_b32 s6, 0
2328 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2329 ; SI-NEXT: v_mov_b32_e32 v1, 0
2330 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2331 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
2332 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2333 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
2334 ; SI-NEXT: s_waitcnt vmcnt(0)
2335 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4000, v2
2336 ; SI-NEXT: s_mov_b32 s4, 0xffff0000
2337 ; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
2338 ; SI-NEXT: v_add_i32_e32 v2, vcc, 2.0, v2
2339 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2342 ; VI-LABEL: v_test_v2i16_x_add_neg_fptwo:
2344 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2345 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2346 ; VI-NEXT: v_mov_b32_e32 v4, 0x4000
2347 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2348 ; VI-NEXT: v_mov_b32_e32 v1, s3
2349 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
2350 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2351 ; VI-NEXT: flat_load_dword v3, v[0:1]
2352 ; VI-NEXT: v_mov_b32_e32 v1, s1
2353 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
2354 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2355 ; VI-NEXT: s_waitcnt vmcnt(0)
2356 ; VI-NEXT: v_add_u16_e32 v2, 0x4000, v3
2357 ; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2358 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
2359 ; VI-NEXT: flat_store_dword v[0:1], v2
2362 ; GFX9-LABEL: v_test_v2i16_x_add_neg_fptwo:
2364 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2365 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2366 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2367 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
2368 ; GFX9-NEXT: s_mov_b32 s2, 0xc000c000
2369 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2370 ; GFX9-NEXT: v_pk_sub_u16 v1, v1, s2
2371 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2372 ; GFX9-NEXT: s_endpgm
2374 ; GFX10-LABEL: v_test_v2i16_x_add_neg_fptwo:
2376 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2377 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2378 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2379 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
2380 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2381 ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 0xc000 op_sel_hi:[1,0]
2382 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
2383 ; GFX10-NEXT: s_endpgm
2385 ; GFX11-LABEL: v_test_v2i16_x_add_neg_fptwo:
2387 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
2388 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2389 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2390 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
2391 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2392 ; GFX11-NEXT: v_pk_sub_u16 v1, v1, 0xc000 op_sel_hi:[1,0]
2393 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2394 ; GFX11-NEXT: s_nop 0
2395 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2396 ; GFX11-NEXT: s_endpgm
2397 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2398 %tid.ext = sext i32 %tid to i64
2399 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
2400 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
2401 %x = load <2 x i16>, ptr addrspace(1) %gep
2402 %result = add <2 x i16> %x, <i16 16384, i16 16384>
2403 store <2 x i16> %result, ptr addrspace(1) %gep.out
; Adds the splat <-16384, -16384> (0xc000 per element) to a <2 x i16> loaded
; from %in at the workitem's index and stores the result to the same index in
; %out.
; The SI and VI checks keep adds of the original constant; on SI the high-half
; add is printed with the inline operand -2.0. The GFX9 checks expect
; 0x40004000 to be materialized in an SGPR and subtracted with v_pk_sub_u16;
; the GFX10/GFX11 checks expect v_pk_sub_u16 with the literal 0x4000.
; NOTE(review) - 0xc000 is the half-precision bit pattern of -2.0 and the
; subtracted constant 0x4000 is that of 2.0, so by analogy with the fpone
; tests the name of this test looks swapped with
; v_test_v2i16_x_add_neg_fptwo. Confirm whether that is intentional.
2407 define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2408 ; SI-LABEL: v_test_v2i16_x_add_neg_negfptwo:
2410 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
2411 ; SI-NEXT: s_mov_b32 s7, 0xf000
2412 ; SI-NEXT: s_mov_b32 s6, 0
2413 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2414 ; SI-NEXT: v_mov_b32_e32 v1, 0
2415 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2416 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
2417 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2418 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
2419 ; SI-NEXT: s_waitcnt vmcnt(0)
2420 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0xffffc000, v2
2421 ; SI-NEXT: s_mov_b32 s4, 0xffff0000
2422 ; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
2423 ; SI-NEXT: v_add_i32_e32 v2, vcc, -2.0, v2
2424 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2427 ; VI-LABEL: v_test_v2i16_x_add_neg_negfptwo:
2429 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2430 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2431 ; VI-NEXT: v_mov_b32_e32 v4, 0xffffc000
2432 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2433 ; VI-NEXT: v_mov_b32_e32 v1, s3
2434 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
2435 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2436 ; VI-NEXT: flat_load_dword v3, v[0:1]
2437 ; VI-NEXT: v_mov_b32_e32 v1, s1
2438 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
2439 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2440 ; VI-NEXT: s_waitcnt vmcnt(0)
2441 ; VI-NEXT: v_add_u16_e32 v2, 0xc000, v3
2442 ; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2443 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
2444 ; VI-NEXT: flat_store_dword v[0:1], v2
2447 ; GFX9-LABEL: v_test_v2i16_x_add_neg_negfptwo:
2449 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2450 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2451 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2452 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
2453 ; GFX9-NEXT: s_mov_b32 s2, 0x40004000
2454 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2455 ; GFX9-NEXT: v_pk_sub_u16 v1, v1, s2
2456 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2457 ; GFX9-NEXT: s_endpgm
2459 ; GFX10-LABEL: v_test_v2i16_x_add_neg_negfptwo:
2461 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2462 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2463 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2464 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
2465 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2466 ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 0x4000 op_sel_hi:[1,0]
2467 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
2468 ; GFX10-NEXT: s_endpgm
2470 ; GFX11-LABEL: v_test_v2i16_x_add_neg_negfptwo:
2472 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
2473 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2474 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2475 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
2476 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2477 ; GFX11-NEXT: v_pk_sub_u16 v1, v1, 0x4000 op_sel_hi:[1,0]
2478 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2479 ; GFX11-NEXT: s_nop 0
2480 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2481 ; GFX11-NEXT: s_endpgm
2482 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2483 %tid.ext = sext i32 %tid to i64
2484 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
2485 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
2486 %x = load <2 x i16>, ptr addrspace(1) %gep
2487 %result = add <2 x i16> %x, <i16 -16384, i16 -16384>
2488 store <2 x i16> %result, ptr addrspace(1) %gep.out
; Kernel under test: loads the <2 x i16> at %in[tid], adds the constant vector
; <i16 undef, i16 -32> and stores the result at %out[tid], where tid is the
; value returned by @llvm.amdgcn.workitem.id.x.
; Only element 1 of the addend is defined, so this exercises the add -> sub
; constant negation on a partially undef packed operand.
; What the autogenerated checks below pin down:
;   - the gfx900 / gfx1010 / gfx1100 runs expect a single v_pk_sub_u16 with the
;     constant 32 and the op_sel:[0,1] op_sel_hi:[1,0] modifiers;
;   - the tonga run expects v_sub_u16_sdwa reading and writing WORD_1 with 32
;     materialized in a VGPR;
;   - the run without -mcpu expects the low 16 bits to be cleared with
;     0xffff0000 and then a 32-bit add of 0xffe00000 (i.e. -32 << 16).
2492 define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2493 ; SI-LABEL: v_test_v2i16_x_add_undef_neg32:
2495 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
2496 ; SI-NEXT: s_mov_b32 s7, 0xf000
2497 ; SI-NEXT: s_mov_b32 s6, 0
2498 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2499 ; SI-NEXT: v_mov_b32_e32 v1, 0
2500 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2501 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
2502 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2503 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
2504 ; SI-NEXT: s_waitcnt vmcnt(0)
2505 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
2506 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffe00000, v2
2507 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2510 ; VI-LABEL: v_test_v2i16_x_add_undef_neg32:
2512 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2513 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2514 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2515 ; VI-NEXT: v_mov_b32_e32 v1, s3
2516 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
2517 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2518 ; VI-NEXT: flat_load_dword v3, v[0:1]
2519 ; VI-NEXT: v_mov_b32_e32 v1, s1
2520 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
2521 ; VI-NEXT: v_mov_b32_e32 v2, 32
2522 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2523 ; VI-NEXT: s_waitcnt vmcnt(0)
2524 ; VI-NEXT: v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2525 ; VI-NEXT: flat_store_dword v[0:1], v2
2528 ; GFX9-LABEL: v_test_v2i16_x_add_undef_neg32:
2530 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2531 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2532 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2533 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
2534 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2535 ; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
2536 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2537 ; GFX9-NEXT: s_endpgm
2539 ; GFX10-LABEL: v_test_v2i16_x_add_undef_neg32:
2541 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2542 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2543 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2544 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
2545 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2546 ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
2547 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
2548 ; GFX10-NEXT: s_endpgm
2550 ; GFX11-LABEL: v_test_v2i16_x_add_undef_neg32:
2552 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
2553 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2554 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2555 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
2556 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2557 ; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
2558 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2559 ; GFX11-NEXT: s_nop 0
2560 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2561 ; GFX11-NEXT: s_endpgm
; Index both buffers by the sign-extended workitem id.
2562 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2563 %tid.ext = sext i32 %tid to i64
2564 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
2565 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
2566 %x = load <2 x i16>, ptr addrspace(1) %gep
; Addend: element 0 is undef, element 1 is -32.
2567 %result = add <2 x i16> %x, <i16 undef, i16 -32>
2568 store <2 x i16> %result, ptr addrspace(1) %gep.out
; Kernel under test: loads the <2 x i16> at %in[tid], adds the constant vector
; <i16 -32, i16 undef> and stores the result at %out[tid], where tid is the
; value returned by @llvm.amdgcn.workitem.id.x.
; Counterpart of v_test_v2i16_x_add_undef_neg32 with the undef element moved:
; here only element 0 of the addend is defined.
; What the autogenerated checks below pin down:
;   - the gfx900 / gfx1010 / gfx1100 runs expect a plain v_pk_sub_u16 with the
;     constant 32 and no op_sel modifiers;
;   - the tonga run expects a 16-bit v_subrev_u16_e32 of 32;
;   - the run without -mcpu expects a 32-bit v_subrev_i32_e32 of 32 followed by
;     masking the result with 0xffff.
2572 define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2573 ; SI-LABEL: v_test_v2i16_x_add_neg32_undef:
2575 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
2576 ; SI-NEXT: s_mov_b32 s7, 0xf000
2577 ; SI-NEXT: s_mov_b32 s6, 0
2578 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2579 ; SI-NEXT: v_mov_b32_e32 v1, 0
2580 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2581 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
2582 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2583 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
2584 ; SI-NEXT: s_waitcnt vmcnt(0)
2585 ; SI-NEXT: v_subrev_i32_e32 v2, vcc, 32, v2
2586 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
2587 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2590 ; VI-LABEL: v_test_v2i16_x_add_neg32_undef:
2592 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2593 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2594 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2595 ; VI-NEXT: v_mov_b32_e32 v1, s3
2596 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
2597 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2598 ; VI-NEXT: flat_load_dword v3, v[0:1]
2599 ; VI-NEXT: v_mov_b32_e32 v1, s1
2600 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
2601 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2602 ; VI-NEXT: s_waitcnt vmcnt(0)
2603 ; VI-NEXT: v_subrev_u16_e32 v2, 32, v3
2604 ; VI-NEXT: flat_store_dword v[0:1], v2
2607 ; GFX9-LABEL: v_test_v2i16_x_add_neg32_undef:
2609 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2610 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2611 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2612 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
2613 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2614 ; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32
2615 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2616 ; GFX9-NEXT: s_endpgm
2618 ; GFX10-LABEL: v_test_v2i16_x_add_neg32_undef:
2620 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2621 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2622 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2623 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
2624 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2625 ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32
2626 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
2627 ; GFX10-NEXT: s_endpgm
2629 ; GFX11-LABEL: v_test_v2i16_x_add_neg32_undef:
2631 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
2632 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2633 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2634 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
2635 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2636 ; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32
2637 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2638 ; GFX11-NEXT: s_nop 0
2639 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2640 ; GFX11-NEXT: s_endpgm
; Index both buffers by the sign-extended workitem id.
2641 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2642 %tid.ext = sext i32 %tid to i64
2643 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
2644 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
2645 %x = load <2 x i16>, ptr addrspace(1) %gep
; Addend: element 0 is -32, element 1 is undef.
2646 %result = add <2 x i16> %x, <i16 -32, i16 undef>
2647 store <2 x i16> %result, ptr addrspace(1) %gep.out
; Workitem-id intrinsic that the kernels call to compute the element index
; used for their %in / %out accesses.
2651 declare i32 @llvm.amdgcn.workitem.id.x() #1
; Attribute groups: #0 is attached to the kernel definitions, #1 to the
; intrinsic declaration.
2653 attributes #0 = { nounwind }
2654 attributes #1 = { nounwind readnone }