; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GFX9
; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefix=GFX10

; Test that add/sub with a constant is swapped to sub/add with negated
; constant to minimize code size.
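;
; A minimal sketch of the idea (illustrative comment only, not a FileCheck
; assertion), assuming the usual -16..64 integer inline-immediate range: a
; canonicalized 'add i32 %x, -64' would need the 32-bit literal 0xffffffc0,
; while the swapped form 'sub i32 %x, 64' lets the selected v_subrev use 64 as
; an inline immediate, as the v_test_i32_x_sub_64 checks below show. When
; neither the constant nor its negation is an inline immediate (e.g. 65 /
; 0xffffffbf), no swap helps.
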
define amdgpu_kernel void @v_test_i32_x_sub_64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
11 ; SI-LABEL: v_test_i32_x_sub_64:
13 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
14 ; SI-NEXT: s_mov_b32 s7, 0xf000
15 ; SI-NEXT: s_mov_b32 s6, 0
16 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
17 ; SI-NEXT: v_mov_b32_e32 v1, 0
18 ; SI-NEXT: s_waitcnt lgkmcnt(0)
19 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
20 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
21 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
22 ; SI-NEXT: s_waitcnt vmcnt(0)
23 ; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
24 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
27 ; VI-LABEL: v_test_i32_x_sub_64:
29 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
31 ; VI-NEXT: s_waitcnt lgkmcnt(0)
32 ; VI-NEXT: v_mov_b32_e32 v1, s3
33 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
34 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
35 ; VI-NEXT: flat_load_dword v0, v[0:1]
36 ; VI-NEXT: v_mov_b32_e32 v3, s1
37 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
38 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
39 ; VI-NEXT: s_waitcnt vmcnt(0)
40 ; VI-NEXT: v_subrev_u32_e32 v0, vcc, 64, v0
41 ; VI-NEXT: flat_store_dword v[2:3], v0
44 ; GFX9-LABEL: v_test_i32_x_sub_64:
46 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
47 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
48 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
49 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
50 ; GFX9-NEXT: s_waitcnt vmcnt(0)
51 ; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v1
52 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
55 ; GFX10-LABEL: v_test_i32_x_sub_64:
57 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
58 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
59 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
60 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
61 ; GFX10-NEXT: s_waitcnt vmcnt(0)
62 ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 64, v1
63 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
64 ; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %x = load i32, i32 addrspace(1)* %gep
  %result = sub i32 %x, 64
  store i32 %result, i32 addrspace(1)* %gep.out
  ret void
}

75 define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
76 ; SI-LABEL: v_test_i32_x_sub_64_multi_use:
78 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
79 ; SI-NEXT: s_mov_b32 s7, 0xf000
80 ; SI-NEXT: s_mov_b32 s6, 0
81 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
82 ; SI-NEXT: v_mov_b32_e32 v1, 0
83 ; SI-NEXT: s_waitcnt lgkmcnt(0)
84 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
85 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
86 ; SI-NEXT: s_waitcnt vmcnt(0)
87 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
88 ; SI-NEXT: s_waitcnt vmcnt(0)
89 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
90 ; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
91 ; SI-NEXT: v_subrev_i32_e32 v3, vcc, 64, v3
92 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
93 ; SI-NEXT: s_waitcnt vmcnt(0)
94 ; SI-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
95 ; SI-NEXT: s_waitcnt vmcnt(0)
98 ; VI-LABEL: v_test_i32_x_sub_64_multi_use:
100 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
101 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
102 ; VI-NEXT: s_waitcnt lgkmcnt(0)
103 ; VI-NEXT: v_mov_b32_e32 v1, s3
104 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
105 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
106 ; VI-NEXT: flat_load_dword v3, v[0:1] glc
107 ; VI-NEXT: s_waitcnt vmcnt(0)
108 ; VI-NEXT: flat_load_dword v4, v[0:1] glc
109 ; VI-NEXT: s_waitcnt vmcnt(0)
110 ; VI-NEXT: v_mov_b32_e32 v1, s1
111 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
112 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
113 ; VI-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3
114 ; VI-NEXT: v_subrev_u32_e32 v3, vcc, 64, v4
115 ; VI-NEXT: flat_store_dword v[0:1], v2
116 ; VI-NEXT: s_waitcnt vmcnt(0)
117 ; VI-NEXT: flat_store_dword v[0:1], v3
118 ; VI-NEXT: s_waitcnt vmcnt(0)
121 ; GFX9-LABEL: v_test_i32_x_sub_64_multi_use:
123 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
124 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
125 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
126 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
127 ; GFX9-NEXT: s_waitcnt vmcnt(0)
128 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
129 ; GFX9-NEXT: s_waitcnt vmcnt(0)
130 ; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v1
131 ; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v2
132 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
133 ; GFX9-NEXT: s_waitcnt vmcnt(0)
134 ; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
135 ; GFX9-NEXT: s_waitcnt vmcnt(0)
136 ; GFX9-NEXT: s_endpgm
138 ; GFX10-LABEL: v_test_i32_x_sub_64_multi_use:
140 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
141 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
142 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
143 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
144 ; GFX10-NEXT: s_waitcnt vmcnt(0)
145 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
146 ; GFX10-NEXT: s_waitcnt vmcnt(0)
147 ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 64, v1
148 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v2
149 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
150 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
151 ; GFX10-NEXT: global_store_dword v0, v2, s[0:1]
152 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
153 ; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %x = load volatile i32, i32 addrspace(1)* %gep
  %y = load volatile i32, i32 addrspace(1)* %gep
  %result0 = sub i32 %x, 64
  %result1 = sub i32 %y, 64
  store volatile i32 %result0, i32 addrspace(1)* %gep.out
  store volatile i32 %result1, i32 addrspace(1)* %gep.out
  ret void
}

167 define amdgpu_kernel void @v_test_i32_64_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
168 ; SI-LABEL: v_test_i32_64_sub_x:
170 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
171 ; SI-NEXT: s_mov_b32 s7, 0xf000
172 ; SI-NEXT: s_mov_b32 s6, 0
173 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
174 ; SI-NEXT: v_mov_b32_e32 v1, 0
175 ; SI-NEXT: s_waitcnt lgkmcnt(0)
176 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
177 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
178 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
179 ; SI-NEXT: s_waitcnt vmcnt(0)
180 ; SI-NEXT: v_sub_i32_e32 v2, vcc, 64, v2
181 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
184 ; VI-LABEL: v_test_i32_64_sub_x:
186 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
187 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
188 ; VI-NEXT: s_waitcnt lgkmcnt(0)
189 ; VI-NEXT: v_mov_b32_e32 v1, s3
190 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
191 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
192 ; VI-NEXT: flat_load_dword v0, v[0:1]
193 ; VI-NEXT: v_mov_b32_e32 v3, s1
194 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
195 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
196 ; VI-NEXT: s_waitcnt vmcnt(0)
197 ; VI-NEXT: v_sub_u32_e32 v0, vcc, 64, v0
198 ; VI-NEXT: flat_store_dword v[2:3], v0
201 ; GFX9-LABEL: v_test_i32_64_sub_x:
203 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
204 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
205 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
206 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
207 ; GFX9-NEXT: s_waitcnt vmcnt(0)
208 ; GFX9-NEXT: v_sub_u32_e32 v1, 64, v1
209 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
210 ; GFX9-NEXT: s_endpgm
212 ; GFX10-LABEL: v_test_i32_64_sub_x:
214 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
215 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
216 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
217 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
218 ; GFX10-NEXT: s_waitcnt vmcnt(0)
219 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 64, v1
220 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
221 ; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %x = load i32, i32 addrspace(1)* %gep
  %result = sub i32 64, %x
  store i32 %result, i32 addrspace(1)* %gep.out
  ret void
}

232 define amdgpu_kernel void @v_test_i32_x_sub_65(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
233 ; SI-LABEL: v_test_i32_x_sub_65:
235 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
236 ; SI-NEXT: s_mov_b32 s7, 0xf000
237 ; SI-NEXT: s_mov_b32 s6, 0
238 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
239 ; SI-NEXT: v_mov_b32_e32 v1, 0
240 ; SI-NEXT: s_waitcnt lgkmcnt(0)
241 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
242 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
243 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
244 ; SI-NEXT: s_waitcnt vmcnt(0)
245 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffffffbf, v2
246 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
249 ; VI-LABEL: v_test_i32_x_sub_65:
251 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
252 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
253 ; VI-NEXT: s_waitcnt lgkmcnt(0)
254 ; VI-NEXT: v_mov_b32_e32 v1, s3
255 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
256 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
257 ; VI-NEXT: flat_load_dword v0, v[0:1]
258 ; VI-NEXT: v_mov_b32_e32 v3, s1
259 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
260 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
261 ; VI-NEXT: s_waitcnt vmcnt(0)
262 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0xffffffbf, v0
263 ; VI-NEXT: flat_store_dword v[2:3], v0
266 ; GFX9-LABEL: v_test_i32_x_sub_65:
268 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
269 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
270 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
271 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
272 ; GFX9-NEXT: s_waitcnt vmcnt(0)
273 ; GFX9-NEXT: v_add_u32_e32 v1, 0xffffffbf, v1
274 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
275 ; GFX9-NEXT: s_endpgm
277 ; GFX10-LABEL: v_test_i32_x_sub_65:
279 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
280 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
281 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
282 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
283 ; GFX10-NEXT: s_waitcnt vmcnt(0)
284 ; GFX10-NEXT: v_add_nc_u32_e32 v1, 0xffffffbf, v1
285 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
286 ; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %x = load i32, i32 addrspace(1)* %gep
  %result = sub i32 %x, 65
  store i32 %result, i32 addrspace(1)* %gep.out
  ret void
}

297 define amdgpu_kernel void @v_test_i32_65_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
298 ; SI-LABEL: v_test_i32_65_sub_x:
300 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
301 ; SI-NEXT: s_mov_b32 s7, 0xf000
302 ; SI-NEXT: s_mov_b32 s6, 0
303 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
304 ; SI-NEXT: v_mov_b32_e32 v1, 0
305 ; SI-NEXT: s_waitcnt lgkmcnt(0)
306 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
307 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
308 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
309 ; SI-NEXT: s_waitcnt vmcnt(0)
310 ; SI-NEXT: v_sub_i32_e32 v2, vcc, 0x41, v2
311 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
314 ; VI-LABEL: v_test_i32_65_sub_x:
316 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
317 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
318 ; VI-NEXT: s_waitcnt lgkmcnt(0)
319 ; VI-NEXT: v_mov_b32_e32 v1, s3
320 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
321 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
322 ; VI-NEXT: flat_load_dword v0, v[0:1]
323 ; VI-NEXT: v_mov_b32_e32 v3, s1
324 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
325 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
326 ; VI-NEXT: s_waitcnt vmcnt(0)
327 ; VI-NEXT: v_sub_u32_e32 v0, vcc, 0x41, v0
328 ; VI-NEXT: flat_store_dword v[2:3], v0
331 ; GFX9-LABEL: v_test_i32_65_sub_x:
333 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
334 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
335 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
336 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
337 ; GFX9-NEXT: s_waitcnt vmcnt(0)
338 ; GFX9-NEXT: v_sub_u32_e32 v1, 0x41, v1
339 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
340 ; GFX9-NEXT: s_endpgm
342 ; GFX10-LABEL: v_test_i32_65_sub_x:
344 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
345 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
346 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
347 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
348 ; GFX10-NEXT: s_waitcnt vmcnt(0)
349 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0x41, v1
350 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
351 ; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %x = load i32, i32 addrspace(1)* %gep
  %result = sub i32 65, %x
  store i32 %result, i32 addrspace(1)* %gep.out
  ret void
}

362 define amdgpu_kernel void @v_test_i32_x_sub_neg16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
363 ; SI-LABEL: v_test_i32_x_sub_neg16:
365 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
366 ; SI-NEXT: s_mov_b32 s7, 0xf000
367 ; SI-NEXT: s_mov_b32 s6, 0
368 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
369 ; SI-NEXT: v_mov_b32_e32 v1, 0
370 ; SI-NEXT: s_waitcnt lgkmcnt(0)
371 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
372 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
373 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
374 ; SI-NEXT: s_waitcnt vmcnt(0)
375 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v2
376 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
379 ; VI-LABEL: v_test_i32_x_sub_neg16:
381 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
382 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
383 ; VI-NEXT: s_waitcnt lgkmcnt(0)
384 ; VI-NEXT: v_mov_b32_e32 v1, s3
385 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
386 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
387 ; VI-NEXT: flat_load_dword v0, v[0:1]
388 ; VI-NEXT: v_mov_b32_e32 v3, s1
389 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
390 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
391 ; VI-NEXT: s_waitcnt vmcnt(0)
392 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
393 ; VI-NEXT: flat_store_dword v[2:3], v0
396 ; GFX9-LABEL: v_test_i32_x_sub_neg16:
398 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
399 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
401 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
402 ; GFX9-NEXT: s_waitcnt vmcnt(0)
403 ; GFX9-NEXT: v_add_u32_e32 v1, 16, v1
404 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
405 ; GFX9-NEXT: s_endpgm
407 ; GFX10-LABEL: v_test_i32_x_sub_neg16:
409 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
410 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
411 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
412 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
413 ; GFX10-NEXT: s_waitcnt vmcnt(0)
414 ; GFX10-NEXT: v_add_nc_u32_e32 v1, 16, v1
415 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
416 ; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %x = load i32, i32 addrspace(1)* %gep
  %result = sub i32 %x, -16
  store i32 %result, i32 addrspace(1)* %gep.out
  ret void
}

427 define amdgpu_kernel void @v_test_i32_neg16_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
428 ; SI-LABEL: v_test_i32_neg16_sub_x:
430 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
431 ; SI-NEXT: s_mov_b32 s7, 0xf000
432 ; SI-NEXT: s_mov_b32 s6, 0
433 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
434 ; SI-NEXT: v_mov_b32_e32 v1, 0
435 ; SI-NEXT: s_waitcnt lgkmcnt(0)
436 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
437 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
438 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
439 ; SI-NEXT: s_waitcnt vmcnt(0)
440 ; SI-NEXT: v_sub_i32_e32 v2, vcc, -16, v2
441 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
444 ; VI-LABEL: v_test_i32_neg16_sub_x:
446 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
447 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
448 ; VI-NEXT: s_waitcnt lgkmcnt(0)
449 ; VI-NEXT: v_mov_b32_e32 v1, s3
450 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
451 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
452 ; VI-NEXT: flat_load_dword v0, v[0:1]
453 ; VI-NEXT: v_mov_b32_e32 v3, s1
454 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
455 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
456 ; VI-NEXT: s_waitcnt vmcnt(0)
457 ; VI-NEXT: v_sub_u32_e32 v0, vcc, -16, v0
458 ; VI-NEXT: flat_store_dword v[2:3], v0
461 ; GFX9-LABEL: v_test_i32_neg16_sub_x:
463 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
464 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
465 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
466 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
467 ; GFX9-NEXT: s_waitcnt vmcnt(0)
468 ; GFX9-NEXT: v_sub_u32_e32 v1, -16, v1
469 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
470 ; GFX9-NEXT: s_endpgm
472 ; GFX10-LABEL: v_test_i32_neg16_sub_x:
474 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
475 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
476 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
477 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
478 ; GFX10-NEXT: s_waitcnt vmcnt(0)
479 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, -16, v1
480 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
481 ; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %x = load i32, i32 addrspace(1)* %gep
  %result = sub i32 -16, %x
  store i32 %result, i32 addrspace(1)* %gep.out
  ret void
}

492 define amdgpu_kernel void @v_test_i32_x_sub_neg17(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
493 ; SI-LABEL: v_test_i32_x_sub_neg17:
495 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
496 ; SI-NEXT: s_mov_b32 s7, 0xf000
497 ; SI-NEXT: s_mov_b32 s6, 0
498 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
499 ; SI-NEXT: v_mov_b32_e32 v1, 0
500 ; SI-NEXT: s_waitcnt lgkmcnt(0)
501 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
502 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
503 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
504 ; SI-NEXT: s_waitcnt vmcnt(0)
505 ; SI-NEXT: v_add_i32_e32 v2, vcc, 17, v2
506 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
509 ; VI-LABEL: v_test_i32_x_sub_neg17:
511 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
512 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
513 ; VI-NEXT: s_waitcnt lgkmcnt(0)
514 ; VI-NEXT: v_mov_b32_e32 v1, s3
515 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
516 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
517 ; VI-NEXT: flat_load_dword v0, v[0:1]
518 ; VI-NEXT: v_mov_b32_e32 v3, s1
519 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
520 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
521 ; VI-NEXT: s_waitcnt vmcnt(0)
522 ; VI-NEXT: v_add_u32_e32 v0, vcc, 17, v0
523 ; VI-NEXT: flat_store_dword v[2:3], v0
526 ; GFX9-LABEL: v_test_i32_x_sub_neg17:
528 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
529 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
530 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
531 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
532 ; GFX9-NEXT: s_waitcnt vmcnt(0)
533 ; GFX9-NEXT: v_add_u32_e32 v1, 17, v1
534 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
535 ; GFX9-NEXT: s_endpgm
537 ; GFX10-LABEL: v_test_i32_x_sub_neg17:
539 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
540 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
541 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
542 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
543 ; GFX10-NEXT: s_waitcnt vmcnt(0)
544 ; GFX10-NEXT: v_add_nc_u32_e32 v1, 17, v1
545 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
546 ; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %x = load i32, i32 addrspace(1)* %gep
  %result = sub i32 %x, -17
  store i32 %result, i32 addrspace(1)* %gep.out
  ret void
}

557 define amdgpu_kernel void @v_test_i32_neg17_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
558 ; SI-LABEL: v_test_i32_neg17_sub_x:
560 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
561 ; SI-NEXT: s_mov_b32 s7, 0xf000
562 ; SI-NEXT: s_mov_b32 s6, 0
563 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
564 ; SI-NEXT: v_mov_b32_e32 v1, 0
565 ; SI-NEXT: s_waitcnt lgkmcnt(0)
566 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
567 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
568 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
569 ; SI-NEXT: s_waitcnt vmcnt(0)
570 ; SI-NEXT: v_sub_i32_e32 v2, vcc, 0xffffffef, v2
571 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
574 ; VI-LABEL: v_test_i32_neg17_sub_x:
576 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
577 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
578 ; VI-NEXT: s_waitcnt lgkmcnt(0)
579 ; VI-NEXT: v_mov_b32_e32 v1, s3
580 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
581 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
582 ; VI-NEXT: flat_load_dword v0, v[0:1]
583 ; VI-NEXT: v_mov_b32_e32 v3, s1
584 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
585 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
586 ; VI-NEXT: s_waitcnt vmcnt(0)
587 ; VI-NEXT: v_sub_u32_e32 v0, vcc, 0xffffffef, v0
588 ; VI-NEXT: flat_store_dword v[2:3], v0
591 ; GFX9-LABEL: v_test_i32_neg17_sub_x:
593 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
594 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
595 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
596 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
597 ; GFX9-NEXT: s_waitcnt vmcnt(0)
598 ; GFX9-NEXT: v_sub_u32_e32 v1, 0xffffffef, v1
599 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
600 ; GFX9-NEXT: s_endpgm
602 ; GFX10-LABEL: v_test_i32_neg17_sub_x:
604 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
605 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
606 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
607 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
608 ; GFX10-NEXT: s_waitcnt vmcnt(0)
609 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0xffffffef, v1
610 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
611 ; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %x = load i32, i32 addrspace(1)* %gep
  %result = sub i32 -17, %x
  store i32 %result, i32 addrspace(1)* %gep.out
  ret void
}

622 define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 {
623 ; SI-LABEL: s_test_i32_x_sub_64:
625 ; SI-NEXT: s_load_dword s0, s[0:1], 0x9
626 ; SI-NEXT: s_waitcnt lgkmcnt(0)
627 ; SI-NEXT: s_sub_i32 s0, s0, 64
628 ; SI-NEXT: ;;#ASMSTART
633 ; VI-LABEL: s_test_i32_x_sub_64:
635 ; VI-NEXT: s_load_dword s0, s[0:1], 0x24
636 ; VI-NEXT: s_waitcnt lgkmcnt(0)
637 ; VI-NEXT: s_sub_i32 s0, s0, 64
638 ; VI-NEXT: ;;#ASMSTART
643 ; GFX9-LABEL: s_test_i32_x_sub_64:
645 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
646 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
647 ; GFX9-NEXT: s_sub_i32 s0, s0, 64
648 ; GFX9-NEXT: ;;#ASMSTART
649 ; GFX9-NEXT: ; use s0
650 ; GFX9-NEXT: ;;#ASMEND
651 ; GFX9-NEXT: s_endpgm
653 ; GFX10-LABEL: s_test_i32_x_sub_64:
655 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
656 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
657 ; GFX10-NEXT: s_sub_i32 s0, s0, 64
658 ; GFX10-NEXT: ;;#ASMSTART
659 ; GFX10-NEXT: ; use s0
660 ; GFX10-NEXT: ;;#ASMEND
661 ; GFX10-NEXT: s_endpgm
  %result = sub i32 %x, 64
  call void asm sideeffect "; use $0", "s"(i32 %result)
  ret void
}

667 define amdgpu_kernel void @v_test_i16_x_sub_64(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
668 ; SI-LABEL: v_test_i16_x_sub_64:
670 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
671 ; SI-NEXT: s_mov_b32 s7, 0xf000
672 ; SI-NEXT: s_mov_b32 s6, 0
673 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
674 ; SI-NEXT: v_mov_b32_e32 v1, 0
675 ; SI-NEXT: s_waitcnt lgkmcnt(0)
676 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
677 ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
678 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
679 ; SI-NEXT: s_waitcnt vmcnt(0)
680 ; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
681 ; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
684 ; VI-LABEL: v_test_i16_x_sub_64:
686 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
687 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
688 ; VI-NEXT: s_waitcnt lgkmcnt(0)
689 ; VI-NEXT: v_mov_b32_e32 v1, s3
690 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
691 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
692 ; VI-NEXT: flat_load_ushort v0, v[0:1]
693 ; VI-NEXT: v_mov_b32_e32 v3, s1
694 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
695 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
696 ; VI-NEXT: s_waitcnt vmcnt(0)
697 ; VI-NEXT: v_subrev_u16_e32 v0, 64, v0
698 ; VI-NEXT: flat_store_short v[2:3], v0
701 ; GFX9-LABEL: v_test_i16_x_sub_64:
703 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
704 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
705 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
706 ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
707 ; GFX9-NEXT: s_waitcnt vmcnt(0)
708 ; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1
709 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
710 ; GFX9-NEXT: s_endpgm
712 ; GFX10-LABEL: v_test_i16_x_sub_64:
714 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
715 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
716 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
717 ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
718 ; GFX10-NEXT: s_waitcnt vmcnt(0)
719 ; GFX10-NEXT: v_sub_nc_u16 v1, v1, 64
720 ; GFX10-NEXT: global_store_short v0, v1, s[0:1]
721 ; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
  %x = load i16, i16 addrspace(1)* %gep
  %result = sub i16 %x, 64
  store i16 %result, i16 addrspace(1)* %gep.out
  ret void
}

732 define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
733 ; SI-LABEL: v_test_i16_x_sub_64_zext_to_i32:
735 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
736 ; SI-NEXT: s_mov_b32 s7, 0xf000
737 ; SI-NEXT: s_mov_b32 s6, 0
738 ; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
739 ; SI-NEXT: v_mov_b32_e32 v2, 0
740 ; SI-NEXT: s_waitcnt lgkmcnt(0)
741 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
742 ; SI-NEXT: buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64
743 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
744 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
745 ; SI-NEXT: s_waitcnt vmcnt(0)
746 ; SI-NEXT: v_subrev_i32_e32 v0, vcc, 64, v3
747 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
748 ; SI-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
751 ; VI-LABEL: v_test_i16_x_sub_64_zext_to_i32:
753 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
754 ; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
755 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
756 ; VI-NEXT: s_waitcnt lgkmcnt(0)
757 ; VI-NEXT: v_mov_b32_e32 v2, s3
758 ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
759 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
760 ; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v0
761 ; VI-NEXT: flat_load_ushort v0, v[1:2]
762 ; VI-NEXT: v_mov_b32_e32 v4, s1
763 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
764 ; VI-NEXT: s_waitcnt vmcnt(0)
765 ; VI-NEXT: v_subrev_u16_e32 v0, 64, v0
766 ; VI-NEXT: flat_store_dword v[3:4], v0
769 ; GFX9-LABEL: v_test_i16_x_sub_64_zext_to_i32:
771 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
772 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0
773 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
774 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
775 ; GFX9-NEXT: global_load_ushort v1, v1, s[2:3]
776 ; GFX9-NEXT: s_waitcnt vmcnt(0)
777 ; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1
778 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
779 ; GFX9-NEXT: s_endpgm
781 ; GFX10-LABEL: v_test_i16_x_sub_64_zext_to_i32:
783 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
784 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v0
785 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
786 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
787 ; GFX10-NEXT: global_load_ushort v1, v1, s[2:3]
788 ; GFX10-NEXT: s_waitcnt vmcnt(0)
789 ; GFX10-NEXT: v_sub_nc_u16 v1, v1, 64
790 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
791 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
792 ; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %x = load i16, i16 addrspace(1)* %gep
  %result = sub i16 %x, 64
  %zext = zext i16 %result to i32
  store i32 %zext, i32 addrspace(1)* %gep.out
  ret void
}

804 define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
805 ; SI-LABEL: v_test_i16_x_sub_64_multi_use:
807 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
808 ; SI-NEXT: s_mov_b32 s7, 0xf000
809 ; SI-NEXT: s_mov_b32 s6, 0
810 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
811 ; SI-NEXT: v_mov_b32_e32 v1, 0
812 ; SI-NEXT: s_waitcnt lgkmcnt(0)
813 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
814 ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
815 ; SI-NEXT: s_waitcnt vmcnt(0)
816 ; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
817 ; SI-NEXT: s_waitcnt vmcnt(0)
818 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
819 ; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
820 ; SI-NEXT: v_subrev_i32_e32 v3, vcc, 64, v3
821 ; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
822 ; SI-NEXT: s_waitcnt vmcnt(0)
823 ; SI-NEXT: buffer_store_short v3, v[0:1], s[0:3], 0 addr64
824 ; SI-NEXT: s_waitcnt vmcnt(0)
827 ; VI-LABEL: v_test_i16_x_sub_64_multi_use:
829 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
830 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
831 ; VI-NEXT: s_waitcnt lgkmcnt(0)
832 ; VI-NEXT: v_mov_b32_e32 v1, s3
833 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
834 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
835 ; VI-NEXT: flat_load_ushort v3, v[0:1] glc
836 ; VI-NEXT: s_waitcnt vmcnt(0)
837 ; VI-NEXT: flat_load_ushort v4, v[0:1] glc
838 ; VI-NEXT: s_waitcnt vmcnt(0)
839 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
840 ; VI-NEXT: v_mov_b32_e32 v1, s1
841 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
842 ; VI-NEXT: v_subrev_u16_e32 v2, 64, v3
843 ; VI-NEXT: v_subrev_u16_e32 v3, 64, v4
844 ; VI-NEXT: flat_store_short v[0:1], v2
845 ; VI-NEXT: s_waitcnt vmcnt(0)
846 ; VI-NEXT: flat_store_short v[0:1], v3
847 ; VI-NEXT: s_waitcnt vmcnt(0)
850 ; GFX9-LABEL: v_test_i16_x_sub_64_multi_use:
852 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
853 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
854 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
855 ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
856 ; GFX9-NEXT: s_waitcnt vmcnt(0)
857 ; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] glc
858 ; GFX9-NEXT: s_waitcnt vmcnt(0)
859 ; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1
860 ; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v2
861 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
862 ; GFX9-NEXT: s_waitcnt vmcnt(0)
863 ; GFX9-NEXT: global_store_short v0, v2, s[0:1]
864 ; GFX9-NEXT: s_waitcnt vmcnt(0)
865 ; GFX9-NEXT: s_endpgm
867 ; GFX10-LABEL: v_test_i16_x_sub_64_multi_use:
869 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
870 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
871 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
872 ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
873 ; GFX10-NEXT: s_waitcnt vmcnt(0)
874 ; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
875 ; GFX10-NEXT: s_waitcnt vmcnt(0)
876 ; GFX10-NEXT: v_sub_nc_u16 v1, v1, 64
877 ; GFX10-NEXT: v_sub_nc_u16 v2, v2, 64
878 ; GFX10-NEXT: global_store_short v0, v1, s[0:1]
879 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
880 ; GFX10-NEXT: global_store_short v0, v2, s[0:1]
881 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
882 ; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
  %x = load volatile i16, i16 addrspace(1)* %gep
  %y = load volatile i16, i16 addrspace(1)* %gep
  %result0 = sub i16 %x, 64
  %result1 = sub i16 %y, 64
  store volatile i16 %result0, i16 addrspace(1)* %gep.out
  store volatile i16 %result1, i16 addrspace(1)* %gep.out
  ret void
}

896 define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
897 ; SI-LABEL: v_test_v2i16_x_sub_64_64:
899 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
900 ; SI-NEXT: s_mov_b32 s7, 0xf000
901 ; SI-NEXT: s_mov_b32 s6, 0
902 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
903 ; SI-NEXT: v_mov_b32_e32 v1, 0
904 ; SI-NEXT: s_waitcnt lgkmcnt(0)
905 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
906 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
907 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
908 ; SI-NEXT: s_waitcnt vmcnt(0)
909 ; SI-NEXT: v_subrev_i32_e32 v3, vcc, 64, v2
910 ; SI-NEXT: s_mov_b32 s4, 0xffff0000
911 ; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
912 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffc00000, v2
913 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
916 ; VI-LABEL: v_test_v2i16_x_sub_64_64:
918 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
919 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
920 ; VI-NEXT: v_mov_b32_e32 v4, 64
921 ; VI-NEXT: s_waitcnt lgkmcnt(0)
922 ; VI-NEXT: v_mov_b32_e32 v1, s3
923 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
924 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
925 ; VI-NEXT: flat_load_dword v3, v[0:1]
926 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
927 ; VI-NEXT: v_mov_b32_e32 v1, s1
928 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
929 ; VI-NEXT: s_waitcnt vmcnt(0)
930 ; VI-NEXT: v_sub_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
931 ; VI-NEXT: v_subrev_u16_e32 v3, 64, v3
932 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
933 ; VI-NEXT: flat_store_dword v[0:1], v2
936 ; GFX9-LABEL: v_test_v2i16_x_sub_64_64:
938 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
939 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
940 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
941 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
942 ; GFX9-NEXT: s_waitcnt vmcnt(0)
943 ; GFX9-NEXT: v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0]
944 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
945 ; GFX9-NEXT: s_endpgm
947 ; GFX10-LABEL: v_test_v2i16_x_sub_64_64:
949 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
950 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
951 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
952 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
953 ; GFX10-NEXT: s_waitcnt vmcnt(0)
954 ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0]
955 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
956 ; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  %result = sub <2 x i16> %x, <i16 64, i16 64>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}

967 define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
968 ; SI-LABEL: v_test_v2i16_x_sub_7_64:
970 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
971 ; SI-NEXT: s_mov_b32 s7, 0xf000
972 ; SI-NEXT: s_mov_b32 s6, 0
973 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
974 ; SI-NEXT: v_mov_b32_e32 v1, 0
975 ; SI-NEXT: s_waitcnt lgkmcnt(0)
976 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
977 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
978 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
979 ; SI-NEXT: s_waitcnt vmcnt(0)
980 ; SI-NEXT: v_add_i32_e32 v3, vcc, -7, v2
981 ; SI-NEXT: s_mov_b32 s4, 0xffff0000
982 ; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
983 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffc00000, v2
984 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
987 ; VI-LABEL: v_test_v2i16_x_sub_7_64:
989 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
990 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
991 ; VI-NEXT: v_mov_b32_e32 v4, 64
992 ; VI-NEXT: s_waitcnt lgkmcnt(0)
993 ; VI-NEXT: v_mov_b32_e32 v1, s3
994 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
995 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
996 ; VI-NEXT: flat_load_dword v3, v[0:1]
997 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
998 ; VI-NEXT: v_mov_b32_e32 v1, s1
999 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1000 ; VI-NEXT: s_waitcnt vmcnt(0)
1001 ; VI-NEXT: v_add_u16_e32 v2, -7, v3
1002 ; VI-NEXT: v_sub_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1003 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
1004 ; VI-NEXT: flat_store_dword v[0:1], v2
1007 ; GFX9-LABEL: v_test_v2i16_x_sub_7_64:
1009 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1010 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1011 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1012 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1013 ; GFX9-NEXT: s_mov_b32 s2, 0x400007
1014 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1015 ; GFX9-NEXT: v_pk_sub_i16 v1, v1, s2
1016 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1017 ; GFX9-NEXT: s_endpgm
1019 ; GFX10-LABEL: v_test_v2i16_x_sub_7_64:
1021 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1022 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1023 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1024 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
1025 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1026 ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x400007
1027 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1028 ; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  %result = sub <2 x i16> %x, <i16 7, i16 64>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}

1039 define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1040 ; SI-LABEL: v_test_v2i16_x_sub_64_123:
1042 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1043 ; SI-NEXT: s_mov_b32 s7, 0xf000
1044 ; SI-NEXT: s_mov_b32 s6, 0
1045 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1046 ; SI-NEXT: v_mov_b32_e32 v1, 0
1047 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1048 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1049 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1050 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1051 ; SI-NEXT: s_waitcnt vmcnt(0)
1052 ; SI-NEXT: v_subrev_i32_e32 v3, vcc, 64, v2
1053 ; SI-NEXT: s_mov_b32 s4, 0xffff0000
1054 ; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
1055 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xff850000, v2
1056 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1059 ; VI-LABEL: v_test_v2i16_x_sub_64_123:
1061 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1062 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1063 ; VI-NEXT: v_mov_b32_e32 v4, 0xffffff85
1064 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1065 ; VI-NEXT: v_mov_b32_e32 v1, s3
1066 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1067 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1068 ; VI-NEXT: flat_load_dword v3, v[0:1]
1069 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1070 ; VI-NEXT: v_mov_b32_e32 v1, s1
1071 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1072 ; VI-NEXT: s_waitcnt vmcnt(0)
1073 ; VI-NEXT: v_add_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1074 ; VI-NEXT: v_subrev_u16_e32 v3, 64, v3
1075 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
1076 ; VI-NEXT: flat_store_dword v[0:1], v2
1079 ; GFX9-LABEL: v_test_v2i16_x_sub_64_123:
1081 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1082 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1083 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1084 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1085 ; GFX9-NEXT: s_mov_b32 s2, 0x7b0040
1086 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1087 ; GFX9-NEXT: v_pk_sub_i16 v1, v1, s2
1088 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1089 ; GFX9-NEXT: s_endpgm
1091 ; GFX10-LABEL: v_test_v2i16_x_sub_64_123:
1093 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1094 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1095 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1096 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
1097 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1098 ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x7b0040
1099 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1100 ; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  %result = sub <2 x i16> %x, <i16 64, i16 123>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}

1111 ; Can fold 0 and inline immediate in other half.
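; (The GFX9/GFX10 checks below fold the <i16 7, i16 0> constant into a single
; 'v_pk_sub_i16 v1, v1, 7': the zero half needs no extra literal and 7 fits as
; an inline immediate.)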
1112 define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1113 ; SI-LABEL: v_test_v2i16_x_sub_7_0:
1115 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1116 ; SI-NEXT: s_mov_b32 s7, 0xf000
1117 ; SI-NEXT: s_mov_b32 s6, 0
1118 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1119 ; SI-NEXT: v_mov_b32_e32 v1, 0
1120 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1121 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1122 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1123 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1124 ; SI-NEXT: s_waitcnt vmcnt(0)
1125 ; SI-NEXT: v_add_i32_e32 v3, vcc, -7, v2
1126 ; SI-NEXT: s_mov_b32 s4, 0xffff
1127 ; SI-NEXT: v_bfi_b32 v2, s4, v3, v2
1128 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1131 ; VI-LABEL: v_test_v2i16_x_sub_7_0:
1133 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1134 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1135 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1136 ; VI-NEXT: v_mov_b32_e32 v1, s3
1137 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1138 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1139 ; VI-NEXT: flat_load_dword v3, v[0:1]
1140 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1141 ; VI-NEXT: v_mov_b32_e32 v1, s1
1142 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1143 ; VI-NEXT: s_waitcnt vmcnt(0)
1144 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
1145 ; VI-NEXT: v_add_u16_e32 v3, -7, v3
1146 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
1147 ; VI-NEXT: flat_store_dword v[0:1], v2
1150 ; GFX9-LABEL: v_test_v2i16_x_sub_7_0:
1152 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1153 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1154 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1155 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1156 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1157 ; GFX9-NEXT: v_pk_sub_i16 v1, v1, 7
1158 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1159 ; GFX9-NEXT: s_endpgm
1161 ; GFX10-LABEL: v_test_v2i16_x_sub_7_0:
1163 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1164 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1165 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1166 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
1167 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1168 ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 7
1169 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1170 ; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  %result = sub <2 x i16> %x, <i16 7, i16 0>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}

1181 ; Can fold 0 and inline immediate in other half.
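; (Here the GFX9/GFX10 checks route the inline immediate 16 to the high half
; with op_sel:[0,1], leaving the zero half untouched.)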
1182 define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1183 ; SI-LABEL: v_test_v2i16_x_sub_0_16:
1185 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1186 ; SI-NEXT: s_mov_b32 s7, 0xf000
1187 ; SI-NEXT: s_mov_b32 s6, 0
1188 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1189 ; SI-NEXT: v_mov_b32_e32 v1, 0
1190 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1191 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1192 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1193 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1194 ; SI-NEXT: s_waitcnt vmcnt(0)
1195 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xfff00000, v2
1196 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1199 ; VI-LABEL: v_test_v2i16_x_sub_0_16:
1201 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1202 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1203 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1204 ; VI-NEXT: v_mov_b32_e32 v1, s3
1205 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1206 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1207 ; VI-NEXT: flat_load_dword v0, v[0:1]
1208 ; VI-NEXT: v_mov_b32_e32 v1, -16
1209 ; VI-NEXT: v_mov_b32_e32 v3, s1
1210 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1211 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1212 ; VI-NEXT: s_waitcnt vmcnt(0)
1213 ; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1214 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1215 ; VI-NEXT: flat_store_dword v[2:3], v0
1218 ; GFX9-LABEL: v_test_v2i16_x_sub_0_16:
1220 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1221 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1222 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1223 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1224 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1225 ; GFX9-NEXT: v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
1226 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1227 ; GFX9-NEXT: s_endpgm
1229 ; GFX10-LABEL: v_test_v2i16_x_sub_0_16:
1231 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1232 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1233 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1234 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
1235 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1236 ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
1237 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1238 ; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  %result = sub <2 x i16> %x, <i16 0, i16 16>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}

1249 define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1250 ; SI-LABEL: v_test_v2i16_x_sub_0_1_0:
1252 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1253 ; SI-NEXT: s_mov_b32 s7, 0xf000
1254 ; SI-NEXT: s_mov_b32 s6, 0
1255 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1256 ; SI-NEXT: v_mov_b32_e32 v1, 0
1257 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1258 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1259 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1260 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1261 ; SI-NEXT: s_waitcnt vmcnt(0)
1262 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3c000000, v2
1263 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1266 ; VI-LABEL: v_test_v2i16_x_sub_0_1_0:
1268 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1269 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1270 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1271 ; VI-NEXT: v_mov_b32_e32 v1, s3
1272 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1273 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1274 ; VI-NEXT: flat_load_dword v0, v[0:1]
1275 ; VI-NEXT: v_mov_b32_e32 v1, 0x3c00
1276 ; VI-NEXT: v_mov_b32_e32 v3, s1
1277 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1278 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1279 ; VI-NEXT: s_waitcnt vmcnt(0)
1280 ; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1281 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1282 ; VI-NEXT: flat_store_dword v[2:3], v0
1285 ; GFX9-LABEL: v_test_v2i16_x_sub_0_1_0:
1287 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1288 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1289 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1290 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1291 ; GFX9-NEXT: s_brev_b32 s2, 35
1292 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1293 ; GFX9-NEXT: v_pk_sub_i16 v1, v1, s2
1294 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1295 ; GFX9-NEXT: s_endpgm
1297 ; GFX10-LABEL: v_test_v2i16_x_sub_0_1_0:
1299 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1300 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1301 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1302 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
1303 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1304 ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0xc400 op_sel:[0,1] op_sel_hi:[1,0]
1305 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1306 ; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  %result = sub <2 x i16> %x, <i16 0, i16 -15360>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}

1317 define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1318 ; SI-LABEL: v_test_v2i16_x_sub_0_neg1_0:
1320 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1321 ; SI-NEXT: s_mov_b32 s7, 0xf000
1322 ; SI-NEXT: s_mov_b32 s6, 0
1323 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1324 ; SI-NEXT: v_mov_b32_e32 v1, 0
1325 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1326 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1327 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1328 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1329 ; SI-NEXT: s_waitcnt vmcnt(0)
1330 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xbc000000, v2
1331 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1334 ; VI-LABEL: v_test_v2i16_x_sub_0_neg1_0:
1336 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1337 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1338 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1339 ; VI-NEXT: v_mov_b32_e32 v1, s3
1340 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1341 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1342 ; VI-NEXT: flat_load_dword v0, v[0:1]
1343 ; VI-NEXT: v_mov_b32_e32 v1, 0xffffbc00
1344 ; VI-NEXT: v_mov_b32_e32 v3, s1
1345 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1346 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1347 ; VI-NEXT: s_waitcnt vmcnt(0)
1348 ; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1349 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1350 ; VI-NEXT: flat_store_dword v[2:3], v0
1353 ; GFX9-LABEL: v_test_v2i16_x_sub_0_neg1_0:
1355 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1356 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1357 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1358 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1359 ; GFX9-NEXT: s_brev_b32 s2, 34
1360 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1361 ; GFX9-NEXT: v_pk_sub_i16 v1, v1, s2
1362 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1363 ; GFX9-NEXT: s_endpgm
1365 ; GFX10-LABEL: v_test_v2i16_x_sub_0_neg1_0:
1367 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1368 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1369 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1370 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
1371 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1372 ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x4400 op_sel:[0,1] op_sel_hi:[1,0]
1373 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1374 ; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  %result = sub <2 x i16> %x, <i16 0, i16 17408>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}

1385 ; -32 isn't an inline immediate, but 32 is
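; (So the add of <-32, -32> below is emitted as 'v_pk_sub_u16 v1, v1, 32' on
; GFX9/GFX10, trading the 32-bit literal for the inline immediate 32.)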
1386 define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1387 ; SI-LABEL: v_test_v2i16_x_add_neg32_neg32:
1389 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1390 ; SI-NEXT: s_mov_b32 s7, 0xf000
1391 ; SI-NEXT: s_mov_b32 s6, 0
1392 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1393 ; SI-NEXT: v_mov_b32_e32 v1, 0
1394 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1395 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1396 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1397 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1398 ; SI-NEXT: s_waitcnt vmcnt(0)
1399 ; SI-NEXT: v_subrev_i32_e32 v3, vcc, 32, v2
1400 ; SI-NEXT: s_mov_b32 s4, 0xffff0000
1401 ; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
1402 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffe00000, v2
1403 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1406 ; VI-LABEL: v_test_v2i16_x_add_neg32_neg32:
1408 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1409 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1410 ; VI-NEXT: v_mov_b32_e32 v4, 32
1411 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1412 ; VI-NEXT: v_mov_b32_e32 v1, s3
1413 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1414 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1415 ; VI-NEXT: flat_load_dword v3, v[0:1]
1416 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1417 ; VI-NEXT: v_mov_b32_e32 v1, s1
1418 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1419 ; VI-NEXT: s_waitcnt vmcnt(0)
1420 ; VI-NEXT: v_sub_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1421 ; VI-NEXT: v_subrev_u16_e32 v3, 32, v3
1422 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
1423 ; VI-NEXT: flat_store_dword v[0:1], v2
1426 ; GFX9-LABEL: v_test_v2i16_x_add_neg32_neg32:
1428 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1429 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1430 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1431 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1432 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1433 ; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0]
1434 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1435 ; GFX9-NEXT: s_endpgm
1437 ; GFX10-LABEL: v_test_v2i16_x_add_neg32_neg32:
1439 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1440 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1441 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1442 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
1443 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1444 ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0]
1445 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1446 ; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
  %result = add <2 x i16> %x, <i16 -32, i16 -32>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
  ret void
}


define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_0_neg32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffe00000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_0_neg32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, 32
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_0_neg32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_0_neg32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
%x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
%result = add <2 x i16> %x, <i16 0, i16 -32>
store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
ret void
}

define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_neg32_0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_subrev_i32_e32 v3, vcc, 32, v2
; SI-NEXT: s_mov_b32 s4, 0xffff
; SI-NEXT: v_bfi_b32 v2, s4, v3, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_neg32_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; VI-NEXT: v_subrev_u16_e32 v3, 32, v3
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg32_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg32_0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
%x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
%result = add <2 x i16> %x, <i16 -32, i16 0>
store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
ret void
}

; 16 and -16 are both inline immediates
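; (On GCN, integer inline immediates cover -16..64, so both the original add of -16
; and the shrunk sub of 16 can be encoded without an extra 32-bit literal.)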
define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_neg16_neg16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v3, vcc, -16, v2
; SI-NEXT: s_mov_b32 s4, 0xffff0000
; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xfff00000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_neg16_neg16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_mov_b32_e32 v4, -16
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u16_e32 v2, -16, v3
; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg16_neg16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg16_neg16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel_hi:[1,0]
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
%x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
%result = add <2 x i16> %x, <i16 -16, i16 -16>
store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
ret void
}

define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_0_neg16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xfff00000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_0_neg16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, -16
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_0_neg16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_0_neg16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
%x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
%result = add <2 x i16> %x, <i16 0, i16 -16>
store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
ret void
}

define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_neg16_0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v3, vcc, -16, v2
; SI-NEXT: s_mov_b32 s4, 0xffff
; SI-NEXT: v_bfi_b32 v2, s4, v3, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_neg16_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; VI-NEXT: v_add_u16_e32 v3, -16, v3
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg16_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, 16
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg16_0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 16
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
%x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
%result = add <2 x i16> %x, <i16 -16, i16 0>
store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
ret void
}

define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_neg_fpone:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v3, vcc, 0xffffc400, v2
; SI-NEXT: s_mov_b32 s4, 0xffff0000
; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xc4000000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_neg_fpone:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_movk_i32 s2, 0xc400
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u16_e32 v2, s2, v3
; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg_fpone:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_mov_b32 s2, 0x3c003c00
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg_fpone:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 0x3c00 op_sel_hi:[1,0]
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
%x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
%result = add <2 x i16> %x, <i16 -15360, i16 -15360>
store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
ret void
}

define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_neg_negfpone:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4400, v2
; SI-NEXT: s_mov_b32 s4, 0xffff0000
; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44000000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_neg_negfpone:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_movk_i32 s2, 0x4400
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u16_e32 v2, s2, v3
; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg_negfpone:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_mov_b32 s2, 0xbc00bc00
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg_negfpone:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 0xbc00 op_sel_hi:[1,0]
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
%x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
%result = add <2 x i16> %x, <i16 17408, i16 17408>
store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
ret void
}

define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_neg_fptwo:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4000, v2
; SI-NEXT: s_mov_b32 s4, 0xffff0000
; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
; SI-NEXT: v_add_i32_e32 v2, vcc, 2.0, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_neg_fptwo:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_movk_i32 s2, 0x4000
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u16_e32 v2, s2, v3
; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg_fptwo:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_mov_b32 s2, 0xc000c000
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg_fptwo:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 0xc000 op_sel_hi:[1,0]
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
%x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
%result = add <2 x i16> %x, <i16 16384, i16 16384>
store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
ret void
}

define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_neg_negfptwo:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v3, vcc, 0xffffc000, v2
; SI-NEXT: s_mov_b32 s4, 0xffff0000
; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
; SI-NEXT: v_add_i32_e32 v2, vcc, -2.0, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_neg_negfptwo:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_movk_i32 s2, 0xc000
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u16_e32 v2, s2, v3
; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg_negfptwo:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_mov_b32 s2, 0x40004000
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg_negfptwo:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 0x4000 op_sel_hi:[1,0]
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
%x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
%result = add <2 x i16> %x, <i16 -16384, i16 -16384>
store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
ret void
}

define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_undef_neg32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffe00000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_undef_neg32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, 32
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_undef_neg32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_undef_neg32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
%x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
%result = add <2 x i16> %x, <i16 undef, i16 -32>
store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
ret void
}

define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; SI-LABEL: v_test_v2i16_x_add_neg32_undef:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_subrev_i32_e32 v2, vcc, 32, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_v2i16_x_add_neg32_undef:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_subrev_u16_e32 v0, 32, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg32_undef:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg32_undef:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
%x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
%result = add <2 x i16> %x, <i16 -32, i16 undef>
store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }