1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,SI
3 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,VI
4 ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX9
6 ; Test that add/sub with a constant is swapped to sub/add with negated
7 ; constant to minimize code size.
; (x - 64): 64 fits the inline-immediate encoding, so all three targets keep a
; vector subtract (v_subrev_*) rather than folding to an add of a negated literal.
9 define amdgpu_kernel void @v_test_i32_x_sub_64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
10 ; SI-LABEL: v_test_i32_x_sub_64:
12 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
13 ; SI-NEXT: s_mov_b32 s7, 0xf000
14 ; SI-NEXT: s_mov_b32 s6, 0
15 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
16 ; SI-NEXT: v_mov_b32_e32 v1, 0
17 ; SI-NEXT: s_waitcnt lgkmcnt(0)
18 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
19 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
20 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
21 ; SI-NEXT: s_waitcnt vmcnt(0)
22 ; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
23 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
26 ; VI-LABEL: v_test_i32_x_sub_64:
28 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
29 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
30 ; VI-NEXT: s_waitcnt lgkmcnt(0)
31 ; VI-NEXT: v_mov_b32_e32 v1, s3
32 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
33 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
34 ; VI-NEXT: flat_load_dword v3, v[0:1]
35 ; VI-NEXT: v_mov_b32_e32 v1, s1
36 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
37 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
38 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
39 ; VI-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3
40 ; VI-NEXT: flat_store_dword v[0:1], v2
43 ; GFX9-LABEL: v_test_i32_x_sub_64:
45 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
46 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
47 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
48 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
49 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
50 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
51 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
52 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
53 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
54 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
55 ; GFX9-NEXT: s_waitcnt vmcnt(0)
56 ; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v3
57 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
59 %tid = call i32 @llvm.amdgcn.workitem.id.x()
60 %tid.ext = sext i32 %tid to i64
61 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
62 %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
63 %x = load i32, i32 addrspace(1)* %gep
64 %result = sub i32 %x, 64
65 store i32 %result, i32 addrspace(1)* %gep.out
; Same (x - 64) fold as above, but with two volatile loads/stores so the value
; is subtracted twice; each use independently keeps an inline-imm subtract.
69 define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
70 ; SI-LABEL: v_test_i32_x_sub_64_multi_use:
72 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
73 ; SI-NEXT: s_mov_b32 s7, 0xf000
74 ; SI-NEXT: s_mov_b32 s6, 0
75 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
76 ; SI-NEXT: v_mov_b32_e32 v1, 0
77 ; SI-NEXT: s_waitcnt lgkmcnt(0)
78 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
79 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
80 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
81 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
82 ; SI-NEXT: s_waitcnt vmcnt(1)
83 ; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
84 ; SI-NEXT: s_waitcnt vmcnt(0)
85 ; SI-NEXT: v_subrev_i32_e32 v3, vcc, 64, v3
86 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
87 ; SI-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
90 ; VI-LABEL: v_test_i32_x_sub_64_multi_use:
92 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
93 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
94 ; VI-NEXT: s_waitcnt lgkmcnt(0)
95 ; VI-NEXT: v_mov_b32_e32 v1, s3
96 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
97 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
98 ; VI-NEXT: flat_load_dword v4, v[0:1]
99 ; VI-NEXT: flat_load_dword v0, v[0:1]
100 ; VI-NEXT: v_mov_b32_e32 v3, s1
101 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
102 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
103 ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
104 ; VI-NEXT: v_subrev_u32_e32 v1, vcc, 64, v4
105 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
106 ; VI-NEXT: v_subrev_u32_e32 v0, vcc, 64, v0
107 ; VI-NEXT: flat_store_dword v[2:3], v1
108 ; VI-NEXT: flat_store_dword v[2:3], v0
111 ; GFX9-LABEL: v_test_i32_x_sub_64_multi_use:
113 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
114 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
115 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
116 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
117 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
118 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
119 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
120 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
121 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
122 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
123 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
124 ; GFX9-NEXT: s_waitcnt vmcnt(1)
125 ; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v4
126 ; GFX9-NEXT: s_waitcnt vmcnt(0)
127 ; GFX9-NEXT: v_subrev_u32_e32 v0, 64, v0
128 ; GFX9-NEXT: global_store_dword v[2:3], v1, off
129 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
130 ; GFX9-NEXT: s_endpgm
131 %tid = call i32 @llvm.amdgcn.workitem.id.x()
132 %tid.ext = sext i32 %tid to i64
133 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
134 %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
135 %x = load volatile i32, i32 addrspace(1)* %gep
136 %y = load volatile i32, i32 addrspace(1)* %gep
137 %result0 = sub i32 %x, 64
138 %result1 = sub i32 %y, 64
139 store volatile i32 %result0, i32 addrspace(1)* %gep.out
140 store volatile i32 %result1, i32 addrspace(1)* %gep.out
; (64 - x): the constant is the minuend, so no add/sub swap is possible; the
; inline immediate 64 is used directly as src0 of v_sub on all targets.
144 define amdgpu_kernel void @v_test_i32_64_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
145 ; SI-LABEL: v_test_i32_64_sub_x:
147 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
148 ; SI-NEXT: s_mov_b32 s7, 0xf000
149 ; SI-NEXT: s_mov_b32 s6, 0
150 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
151 ; SI-NEXT: v_mov_b32_e32 v1, 0
152 ; SI-NEXT: s_waitcnt lgkmcnt(0)
153 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
154 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
155 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
156 ; SI-NEXT: s_waitcnt vmcnt(0)
157 ; SI-NEXT: v_sub_i32_e32 v2, vcc, 64, v2
158 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
161 ; VI-LABEL: v_test_i32_64_sub_x:
163 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
164 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
165 ; VI-NEXT: s_waitcnt lgkmcnt(0)
166 ; VI-NEXT: v_mov_b32_e32 v1, s3
167 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
168 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
169 ; VI-NEXT: flat_load_dword v3, v[0:1]
170 ; VI-NEXT: v_mov_b32_e32 v1, s1
171 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
172 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
173 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
174 ; VI-NEXT: v_sub_u32_e32 v2, vcc, 64, v3
175 ; VI-NEXT: flat_store_dword v[0:1], v2
178 ; GFX9-LABEL: v_test_i32_64_sub_x:
180 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
181 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
182 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
183 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
184 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
185 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
186 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
187 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
188 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
189 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
190 ; GFX9-NEXT: s_waitcnt vmcnt(0)
191 ; GFX9-NEXT: v_sub_u32_e32 v2, 64, v3
192 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
193 ; GFX9-NEXT: s_endpgm
194 %tid = call i32 @llvm.amdgcn.workitem.id.x()
195 %tid.ext = sext i32 %tid to i64
196 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
197 %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
198 %x = load i32, i32 addrspace(1)* %gep
199 %result = sub i32 64, %x
200 store i32 %result, i32 addrspace(1)* %gep.out
; (x - 65): 65 is not an inline immediate, so the sub is rewritten as an add of
; the negated literal 0xffffffbf (-65) on every target — the fold under test.
204 define amdgpu_kernel void @v_test_i32_x_sub_65(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
205 ; SI-LABEL: v_test_i32_x_sub_65:
207 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
208 ; SI-NEXT: s_mov_b32 s7, 0xf000
209 ; SI-NEXT: s_mov_b32 s6, 0
210 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
211 ; SI-NEXT: v_mov_b32_e32 v1, 0
212 ; SI-NEXT: s_waitcnt lgkmcnt(0)
213 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
214 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
215 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
216 ; SI-NEXT: s_waitcnt vmcnt(0)
217 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffffffbf, v2
218 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
221 ; VI-LABEL: v_test_i32_x_sub_65:
223 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
224 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
225 ; VI-NEXT: s_waitcnt lgkmcnt(0)
226 ; VI-NEXT: v_mov_b32_e32 v1, s3
227 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
228 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
229 ; VI-NEXT: flat_load_dword v3, v[0:1]
230 ; VI-NEXT: v_mov_b32_e32 v1, s1
231 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
232 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
233 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
234 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0xffffffbf, v3
235 ; VI-NEXT: flat_store_dword v[0:1], v2
238 ; GFX9-LABEL: v_test_i32_x_sub_65:
240 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
241 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
242 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
243 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
244 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
245 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
246 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
247 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
248 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
249 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
250 ; GFX9-NEXT: s_waitcnt vmcnt(0)
251 ; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffbf, v3
252 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
253 ; GFX9-NEXT: s_endpgm
254 %tid = call i32 @llvm.amdgcn.workitem.id.x()
255 %tid.ext = sext i32 %tid to i64
256 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
257 %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
258 %x = load i32, i32 addrspace(1)* %gep
259 %result = sub i32 %x, 65
260 store i32 %result, i32 addrspace(1)* %gep.out
; (65 - x): constant minuend, no swap possible; 65 must be emitted as the
; literal 0x41 in src0 of the v_sub on every target.
264 define amdgpu_kernel void @v_test_i32_65_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
265 ; SI-LABEL: v_test_i32_65_sub_x:
267 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
268 ; SI-NEXT: s_mov_b32 s7, 0xf000
269 ; SI-NEXT: s_mov_b32 s6, 0
270 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
271 ; SI-NEXT: v_mov_b32_e32 v1, 0
272 ; SI-NEXT: s_waitcnt lgkmcnt(0)
273 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
274 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
275 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
276 ; SI-NEXT: s_waitcnt vmcnt(0)
277 ; SI-NEXT: v_sub_i32_e32 v2, vcc, 0x41, v2
278 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
281 ; VI-LABEL: v_test_i32_65_sub_x:
283 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
284 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
285 ; VI-NEXT: s_waitcnt lgkmcnt(0)
286 ; VI-NEXT: v_mov_b32_e32 v1, s3
287 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
288 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
289 ; VI-NEXT: flat_load_dword v3, v[0:1]
290 ; VI-NEXT: v_mov_b32_e32 v1, s1
291 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
292 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
293 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
294 ; VI-NEXT: v_sub_u32_e32 v2, vcc, 0x41, v3
295 ; VI-NEXT: flat_store_dword v[0:1], v2
298 ; GFX9-LABEL: v_test_i32_65_sub_x:
300 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
301 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
302 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
303 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
304 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
305 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
306 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
307 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
308 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
309 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
310 ; GFX9-NEXT: s_waitcnt vmcnt(0)
311 ; GFX9-NEXT: v_sub_u32_e32 v2, 0x41, v3
312 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
313 ; GFX9-NEXT: s_endpgm
314 %tid = call i32 @llvm.amdgcn.workitem.id.x()
315 %tid.ext = sext i32 %tid to i64
316 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
317 %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
318 %x = load i32, i32 addrspace(1)* %gep
319 %result = sub i32 65, %x
320 store i32 %result, i32 addrspace(1)* %gep.out
; (x - (-16)): folded to an add of the inline immediate 16 on all targets,
; avoiding any literal encoding.
324 define amdgpu_kernel void @v_test_i32_x_sub_neg16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
325 ; SI-LABEL: v_test_i32_x_sub_neg16:
327 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
328 ; SI-NEXT: s_mov_b32 s7, 0xf000
329 ; SI-NEXT: s_mov_b32 s6, 0
330 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
331 ; SI-NEXT: v_mov_b32_e32 v1, 0
332 ; SI-NEXT: s_waitcnt lgkmcnt(0)
333 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
334 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
335 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
336 ; SI-NEXT: s_waitcnt vmcnt(0)
337 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v2
338 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
341 ; VI-LABEL: v_test_i32_x_sub_neg16:
343 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
344 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
345 ; VI-NEXT: s_waitcnt lgkmcnt(0)
346 ; VI-NEXT: v_mov_b32_e32 v1, s3
347 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
348 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
349 ; VI-NEXT: flat_load_dword v3, v[0:1]
350 ; VI-NEXT: v_mov_b32_e32 v1, s1
351 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
352 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
353 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
354 ; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v3
355 ; VI-NEXT: flat_store_dword v[0:1], v2
358 ; GFX9-LABEL: v_test_i32_x_sub_neg16:
360 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
361 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
362 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
363 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
364 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
365 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
366 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
367 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
368 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
369 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
370 ; GFX9-NEXT: s_waitcnt vmcnt(0)
371 ; GFX9-NEXT: v_add_u32_e32 v2, 16, v3
372 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
373 ; GFX9-NEXT: s_endpgm
374 %tid = call i32 @llvm.amdgcn.workitem.id.x()
375 %tid.ext = sext i32 %tid to i64
376 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
377 %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
378 %x = load i32, i32 addrspace(1)* %gep
379 %result = sub i32 -16, %x
380 store i32 %result, i32 addrspace(1)* %gep.out
; (-16 - x): constant minuend; -16 is still an inline immediate, so a plain
; v_sub with inline -16 as src0 is kept on all targets.
384 define amdgpu_kernel void @v_test_i32_neg16_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
385 ; SI-LABEL: v_test_i32_neg16_sub_x:
387 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
388 ; SI-NEXT: s_mov_b32 s7, 0xf000
389 ; SI-NEXT: s_mov_b32 s6, 0
390 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
391 ; SI-NEXT: v_mov_b32_e32 v1, 0
392 ; SI-NEXT: s_waitcnt lgkmcnt(0)
393 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
394 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
395 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
396 ; SI-NEXT: s_waitcnt vmcnt(0)
397 ; SI-NEXT: v_sub_i32_e32 v2, vcc, -16, v2
398 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
401 ; VI-LABEL: v_test_i32_neg16_sub_x:
403 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
404 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
405 ; VI-NEXT: s_waitcnt lgkmcnt(0)
406 ; VI-NEXT: v_mov_b32_e32 v1, s3
407 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
408 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
409 ; VI-NEXT: flat_load_dword v3, v[0:1]
410 ; VI-NEXT: v_mov_b32_e32 v1, s1
411 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
412 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
413 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
414 ; VI-NEXT: v_sub_u32_e32 v2, vcc, -16, v3
415 ; VI-NEXT: flat_store_dword v[0:1], v2
418 ; GFX9-LABEL: v_test_i32_neg16_sub_x:
420 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
421 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
422 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
423 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
424 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
425 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
426 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
427 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
428 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
429 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
430 ; GFX9-NEXT: s_waitcnt vmcnt(0)
431 ; GFX9-NEXT: v_sub_u32_e32 v2, -16, v3
432 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
433 ; GFX9-NEXT: s_endpgm
434 %tid = call i32 @llvm.amdgcn.workitem.id.x()
435 %tid.ext = sext i32 %tid to i64
436 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
437 %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
438 %x = load i32, i32 addrspace(1)* %gep
439 %result = sub i32 -16, %x
440 store i32 %result, i32 addrspace(1)* %gep.out
; (x - (-17)): -17 is outside the inline-immediate range, but the negated
; constant 17 is inline, so the sub is folded to an add of 17 on all targets.
444 define amdgpu_kernel void @v_test_i32_x_sub_neg17(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
445 ; SI-LABEL: v_test_i32_x_sub_neg17:
447 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
448 ; SI-NEXT: s_mov_b32 s7, 0xf000
449 ; SI-NEXT: s_mov_b32 s6, 0
450 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
451 ; SI-NEXT: v_mov_b32_e32 v1, 0
452 ; SI-NEXT: s_waitcnt lgkmcnt(0)
453 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
454 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
455 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
456 ; SI-NEXT: s_waitcnt vmcnt(0)
457 ; SI-NEXT: v_add_i32_e32 v2, vcc, 17, v2
458 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
461 ; VI-LABEL: v_test_i32_x_sub_neg17:
463 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
464 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
465 ; VI-NEXT: s_waitcnt lgkmcnt(0)
466 ; VI-NEXT: v_mov_b32_e32 v1, s3
467 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
468 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
469 ; VI-NEXT: flat_load_dword v3, v[0:1]
470 ; VI-NEXT: v_mov_b32_e32 v1, s1
471 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
472 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
473 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
474 ; VI-NEXT: v_add_u32_e32 v2, vcc, 17, v3
475 ; VI-NEXT: flat_store_dword v[0:1], v2
478 ; GFX9-LABEL: v_test_i32_x_sub_neg17:
480 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
481 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
482 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
483 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
484 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
485 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
486 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
487 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
488 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
489 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
490 ; GFX9-NEXT: s_waitcnt vmcnt(0)
491 ; GFX9-NEXT: v_add_u32_e32 v2, 17, v3
492 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
493 ; GFX9-NEXT: s_endpgm
494 %tid = call i32 @llvm.amdgcn.workitem.id.x()
495 %tid.ext = sext i32 %tid to i64
496 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
497 %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
498 %x = load i32, i32 addrspace(1)* %gep
499 %result = sub i32 %x, -17
500 store i32 %result, i32 addrspace(1)* %gep.out
; (-17 - x): constant minuend; -17 cannot be encoded inline, so the literal
; 0xffffffef is emitted as src0 of the v_sub on all targets.
504 define amdgpu_kernel void @v_test_i32_neg17_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
505 ; SI-LABEL: v_test_i32_neg17_sub_x:
507 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
508 ; SI-NEXT: s_mov_b32 s7, 0xf000
509 ; SI-NEXT: s_mov_b32 s6, 0
510 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
511 ; SI-NEXT: v_mov_b32_e32 v1, 0
512 ; SI-NEXT: s_waitcnt lgkmcnt(0)
513 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
514 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
515 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
516 ; SI-NEXT: s_waitcnt vmcnt(0)
517 ; SI-NEXT: v_sub_i32_e32 v2, vcc, 0xffffffef, v2
518 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
521 ; VI-LABEL: v_test_i32_neg17_sub_x:
523 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
524 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
525 ; VI-NEXT: s_waitcnt lgkmcnt(0)
526 ; VI-NEXT: v_mov_b32_e32 v1, s3
527 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
528 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
529 ; VI-NEXT: flat_load_dword v3, v[0:1]
530 ; VI-NEXT: v_mov_b32_e32 v1, s1
531 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
532 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
533 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
534 ; VI-NEXT: v_sub_u32_e32 v2, vcc, 0xffffffef, v3
535 ; VI-NEXT: flat_store_dword v[0:1], v2
538 ; GFX9-LABEL: v_test_i32_neg17_sub_x:
540 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
541 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
542 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
543 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
544 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
545 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
546 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
547 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
548 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
549 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
550 ; GFX9-NEXT: s_waitcnt vmcnt(0)
551 ; GFX9-NEXT: v_sub_u32_e32 v2, 0xffffffef, v3
552 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
553 ; GFX9-NEXT: s_endpgm
554 %tid = call i32 @llvm.amdgcn.workitem.id.x()
555 %tid.ext = sext i32 %tid to i64
556 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
557 %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
558 %x = load i32, i32 addrspace(1)* %gep
559 %result = sub i32 -17, %x
560 store i32 %result, i32 addrspace(1)* %gep.out
; Scalar variant: (x - 64) on an SGPR value stays an s_sub_i32 on all targets;
; the result is kept live via inline asm so it cannot be folded away.
564 define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 {
565 ; SI-LABEL: s_test_i32_x_sub_64:
567 ; SI-NEXT: s_load_dword s0, s[0:1], 0x9
568 ; SI-NEXT: s_waitcnt lgkmcnt(0)
569 ; SI-NEXT: s_sub_i32 s0, s0, 64
570 ; SI-NEXT: ;;#ASMSTART
575 ; VI-LABEL: s_test_i32_x_sub_64:
577 ; VI-NEXT: s_load_dword s0, s[0:1], 0x24
578 ; VI-NEXT: s_waitcnt lgkmcnt(0)
579 ; VI-NEXT: s_sub_i32 s0, s0, 64
580 ; VI-NEXT: ;;#ASMSTART
585 ; GFX9-LABEL: s_test_i32_x_sub_64:
587 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
588 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
589 ; GFX9-NEXT: s_sub_i32 s0, s0, 64
590 ; GFX9-NEXT: ;;#ASMSTART
591 ; GFX9-NEXT: ; use s0
592 ; GFX9-NEXT: ;;#ASMEND
593 ; GFX9-NEXT: s_endpgm
594 %result = sub i32 %x, 64
595 call void asm sideeffect "; use $0", "s"(i32 %result)
; i16 variant of (x - 64): SI has no 16-bit ALU and uses the 32-bit
; v_subrev_i32; VI/GFX9 use the native 16-bit v_subrev_u16 with inline 64.
599 define amdgpu_kernel void @v_test_i16_x_sub_64(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
600 ; SI-LABEL: v_test_i16_x_sub_64:
602 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
603 ; SI-NEXT: s_mov_b32 s7, 0xf000
604 ; SI-NEXT: s_mov_b32 s6, 0
605 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
606 ; SI-NEXT: v_mov_b32_e32 v1, 0
607 ; SI-NEXT: s_waitcnt lgkmcnt(0)
608 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
609 ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
610 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
611 ; SI-NEXT: s_waitcnt vmcnt(0)
612 ; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
613 ; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
616 ; VI-LABEL: v_test_i16_x_sub_64:
618 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
619 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
620 ; VI-NEXT: s_waitcnt lgkmcnt(0)
621 ; VI-NEXT: v_mov_b32_e32 v1, s3
622 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
623 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
624 ; VI-NEXT: flat_load_ushort v3, v[0:1]
625 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
626 ; VI-NEXT: v_mov_b32_e32 v1, s1
627 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
628 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
629 ; VI-NEXT: v_subrev_u16_e32 v2, 64, v3
630 ; VI-NEXT: flat_store_short v[0:1], v2
633 ; GFX9-LABEL: v_test_i16_x_sub_64:
635 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
636 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v0
637 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
638 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
639 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
640 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
641 ; GFX9-NEXT: global_load_ushort v3, v[0:1], off
642 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
643 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
644 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
645 ; GFX9-NEXT: s_waitcnt vmcnt(0)
646 ; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v3
647 ; GFX9-NEXT: global_store_short v[0:1], v2, off
648 ; GFX9-NEXT: s_endpgm
649 %tid = call i32 @llvm.amdgcn.workitem.id.x()
650 %tid.ext = sext i32 %tid to i64
651 %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext
652 %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
653 %x = load i16, i16 addrspace(1)* %gep
654 %result = sub i16 %x, 64
655 store i16 %result, i16 addrspace(1)* %gep.out
; i16 multi-use variant: two volatile loads/stores force two independent
; 16-bit (32-bit on SI) subtracts of the inline immediate 64.
659 define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
660 ; SI-LABEL: v_test_i16_x_sub_64_multi_use:
662 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
663 ; SI-NEXT: s_mov_b32 s7, 0xf000
664 ; SI-NEXT: s_mov_b32 s6, 0
665 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
666 ; SI-NEXT: v_mov_b32_e32 v1, 0
667 ; SI-NEXT: s_waitcnt lgkmcnt(0)
668 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
669 ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
670 ; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64
671 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
672 ; SI-NEXT: s_waitcnt vmcnt(1)
673 ; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
674 ; SI-NEXT: s_waitcnt vmcnt(0)
675 ; SI-NEXT: v_subrev_i32_e32 v3, vcc, 64, v3
676 ; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
677 ; SI-NEXT: buffer_store_short v3, v[0:1], s[0:3], 0 addr64
680 ; VI-LABEL: v_test_i16_x_sub_64_multi_use:
682 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
683 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
684 ; VI-NEXT: s_waitcnt lgkmcnt(0)
685 ; VI-NEXT: v_mov_b32_e32 v1, s3
686 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
687 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
688 ; VI-NEXT: flat_load_ushort v4, v[0:1]
689 ; VI-NEXT: flat_load_ushort v0, v[0:1]
690 ; VI-NEXT: v_mov_b32_e32 v3, s1
691 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
692 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
693 ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
694 ; VI-NEXT: v_subrev_u16_e32 v1, 64, v4
695 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
696 ; VI-NEXT: v_subrev_u16_e32 v0, 64, v0
697 ; VI-NEXT: flat_store_short v[2:3], v1
698 ; VI-NEXT: flat_store_short v[2:3], v0
701 ; GFX9-LABEL: v_test_i16_x_sub_64_multi_use:
703 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
704 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v0
705 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
706 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
707 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
708 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
709 ; GFX9-NEXT: global_load_ushort v4, v[0:1], off
710 ; GFX9-NEXT: global_load_ushort v0, v[0:1], off
711 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
712 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
713 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
714 ; GFX9-NEXT: s_waitcnt vmcnt(1)
715 ; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v4
716 ; GFX9-NEXT: s_waitcnt vmcnt(0)
717 ; GFX9-NEXT: v_subrev_u16_e32 v0, 64, v0
718 ; GFX9-NEXT: global_store_short v[2:3], v1, off
719 ; GFX9-NEXT: global_store_short v[2:3], v0, off
720 ; GFX9-NEXT: s_endpgm
721 %tid = call i32 @llvm.amdgcn.workitem.id.x()
722 %tid.ext = sext i32 %tid to i64
723 %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext
724 %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
725 %x = load volatile i16, i16 addrspace(1)* %gep
726 %y = load volatile i16, i16 addrspace(1)* %gep
727 %result0 = sub i16 %x, 64
728 %result1 = sub i16 %y, 64
729 store volatile i16 %result0, i16 addrspace(1)* %gep.out
730 store volatile i16 %result1, i16 addrspace(1)* %gep.out
; <2 x i16> (x - <64, 64>): GFX9 uses one packed v_pk_sub_i16 with the
; inline constant broadcast via op_sel_hi; VI splits into a low-half add of
; 0xffffffc0 (-64) plus an SDWA sub on the high half; SI scalarizes with
; 32-bit and/or/add sequences.
734 define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
735 ; SI-LABEL: v_test_v2i16_x_sub_64_64:
737 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
738 ; SI-NEXT: s_mov_b32 s7, 0xf000
739 ; SI-NEXT: s_mov_b32 s6, 0
740 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
741 ; SI-NEXT: v_mov_b32_e32 v1, 0
742 ; SI-NEXT: s_waitcnt lgkmcnt(0)
743 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
744 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
745 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
746 ; SI-NEXT: s_waitcnt vmcnt(0)
747 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
748 ; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
749 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
750 ; SI-NEXT: v_or_b32_e32 v2, v3, v2
751 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffc00000, v2
752 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
755 ; VI-LABEL: v_test_v2i16_x_sub_64_64:
757 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
758 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
759 ; VI-NEXT: v_mov_b32_e32 v3, 64
760 ; VI-NEXT: s_waitcnt lgkmcnt(0)
761 ; VI-NEXT: v_mov_b32_e32 v1, s3
762 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
763 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
764 ; VI-NEXT: flat_load_dword v4, v[0:1]
765 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
766 ; VI-NEXT: v_mov_b32_e32 v1, s1
767 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
768 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
769 ; VI-NEXT: v_add_u16_e32 v2, 0xffffffc0, v4
770 ; VI-NEXT: v_sub_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
771 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
772 ; VI-NEXT: flat_store_dword v[0:1], v2
775 ; GFX9-LABEL: v_test_v2i16_x_sub_64_64:
777 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
778 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
779 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
780 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
781 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
782 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
783 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
784 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
785 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
786 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
787 ; GFX9-NEXT: s_waitcnt vmcnt(0)
788 ; GFX9-NEXT: v_pk_sub_i16 v2, v3, 64 op_sel_hi:[1,0]
789 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
790 ; GFX9-NEXT: s_endpgm
791 %tid = call i32 @llvm.amdgcn.workitem.id.x()
792 %tid.ext = sext i32 %tid to i64
793 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
794 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
795 %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
796 %result = sub <2 x i16> %x, <i16 64, i16 64>
797 store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
; <2 x i16> (x - <7, 64>): the non-splat constant cannot be an inline operand
; of v_pk_sub_i16 on GFX9, so it is materialized in an SGPR as the packed
; value 0x400007 (hi=64, lo=7); VI splits into add -7 (low) + SDWA sub 64
; (high); SI scalarizes.
801 define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
802 ; SI-LABEL: v_test_v2i16_x_sub_7_64:
804 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
805 ; SI-NEXT: s_mov_b32 s7, 0xf000
806 ; SI-NEXT: s_mov_b32 s6, 0
807 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
808 ; SI-NEXT: v_mov_b32_e32 v1, 0
809 ; SI-NEXT: s_waitcnt lgkmcnt(0)
810 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
811 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
812 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
813 ; SI-NEXT: s_waitcnt vmcnt(0)
814 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
815 ; SI-NEXT: v_add_i32_e32 v2, vcc, -7, v2
816 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
817 ; SI-NEXT: v_or_b32_e32 v2, v3, v2
818 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffc00000, v2
819 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
822 ; VI-LABEL: v_test_v2i16_x_sub_7_64:
824 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
825 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
826 ; VI-NEXT: v_mov_b32_e32 v3, 64
827 ; VI-NEXT: s_waitcnt lgkmcnt(0)
828 ; VI-NEXT: v_mov_b32_e32 v1, s3
829 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
830 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
831 ; VI-NEXT: flat_load_dword v4, v[0:1]
832 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
833 ; VI-NEXT: v_mov_b32_e32 v1, s1
834 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
835 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
836 ; VI-NEXT: v_add_u16_e32 v2, -7, v4
837 ; VI-NEXT: v_sub_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
838 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
839 ; VI-NEXT: flat_store_dword v[0:1], v2
842 ; GFX9-LABEL: v_test_v2i16_x_sub_7_64:
844 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
845 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
846 ; GFX9-NEXT: s_mov_b32 s4, 0x400007
847 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
848 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
849 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
850 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
851 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
852 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
853 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
854 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
855 ; GFX9-NEXT: s_waitcnt vmcnt(0)
856 ; GFX9-NEXT: v_pk_sub_i16 v2, v3, s4
857 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
858 ; GFX9-NEXT: s_endpgm
859 %tid = call i32 @llvm.amdgcn.workitem.id.x()
860 %tid.ext = sext i32 %tid to i64
861 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
862 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
863 %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
864 %result = sub <2 x i16> %x, <i16 7, i16 64>
865 store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
; sub <i16 64, i16 123>: GFX9 keeps the packed sub with the combined
; literal 0x7b0040 (123 << 16 | 64) in an SGPR; VI rewrites both halves as
; adds of the negated constants (0xffffffc0 = -64 for the low half,
; 0xffffff85 = -123 via SDWA for the high half); SI subtracts 64 from the
; low half and adds -123 << 16 (0xff850000) for the high half.
869 define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
870 ; SI-LABEL: v_test_v2i16_x_sub_64_123:
872 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
873 ; SI-NEXT: s_mov_b32 s7, 0xf000
874 ; SI-NEXT: s_mov_b32 s6, 0
875 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
876 ; SI-NEXT: v_mov_b32_e32 v1, 0
877 ; SI-NEXT: s_waitcnt lgkmcnt(0)
878 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
879 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
880 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
881 ; SI-NEXT: s_waitcnt vmcnt(0)
882 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
883 ; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
884 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
885 ; SI-NEXT: v_or_b32_e32 v2, v3, v2
886 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xff850000, v2
887 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
890 ; VI-LABEL: v_test_v2i16_x_sub_64_123:
892 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
893 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
894 ; VI-NEXT: v_mov_b32_e32 v3, 0xffffff85
895 ; VI-NEXT: s_waitcnt lgkmcnt(0)
896 ; VI-NEXT: v_mov_b32_e32 v1, s3
897 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
898 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
899 ; VI-NEXT: flat_load_dword v4, v[0:1]
900 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
901 ; VI-NEXT: v_mov_b32_e32 v1, s1
902 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
903 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
904 ; VI-NEXT: v_add_u16_e32 v2, 0xffffffc0, v4
905 ; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
906 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
907 ; VI-NEXT: flat_store_dword v[0:1], v2
910 ; GFX9-LABEL: v_test_v2i16_x_sub_64_123:
912 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
913 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
914 ; GFX9-NEXT: s_mov_b32 s4, 0x7b0040
915 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
916 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
917 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
918 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
919 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
920 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
921 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
922 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
923 ; GFX9-NEXT: s_waitcnt vmcnt(0)
924 ; GFX9-NEXT: v_pk_sub_i16 v2, v3, s4
925 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
926 ; GFX9-NEXT: s_endpgm
927 %tid = call i32 @llvm.amdgcn.workitem.id.x()
928 %tid.ext = sext i32 %tid to i64
929 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
930 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
931 %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
932 %result = sub <2 x i16> %x, <i16 64, i16 123>
933 store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
937 ; Can fold 0 and inline immediate in other half.
; sub <i16 7, i16 0>: the zero half needs no code, so GFX9 folds the
; remaining 7 into v_pk_sub_i16 as an inline immediate; VI and SI add -7 to
; the low half and reassemble the untouched high half with and/or.
938 define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
939 ; SI-LABEL: v_test_v2i16_x_sub_7_0:
941 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
942 ; SI-NEXT: s_mov_b32 s7, 0xf000
943 ; SI-NEXT: s_mov_b32 s6, 0
944 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
945 ; SI-NEXT: v_mov_b32_e32 v1, 0
946 ; SI-NEXT: s_waitcnt lgkmcnt(0)
947 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
948 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
949 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
950 ; SI-NEXT: s_waitcnt vmcnt(0)
951 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
952 ; SI-NEXT: v_add_i32_e32 v2, vcc, -7, v2
953 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
954 ; SI-NEXT: v_or_b32_e32 v2, v2, v3
955 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
958 ; VI-LABEL: v_test_v2i16_x_sub_7_0:
960 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
961 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
962 ; VI-NEXT: s_waitcnt lgkmcnt(0)
963 ; VI-NEXT: v_mov_b32_e32 v1, s3
964 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
965 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
966 ; VI-NEXT: flat_load_dword v3, v[0:1]
967 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
968 ; VI-NEXT: v_mov_b32_e32 v1, s1
969 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
970 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
971 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
972 ; VI-NEXT: v_add_u16_e32 v3, -7, v3
973 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
974 ; VI-NEXT: flat_store_dword v[0:1], v2
977 ; GFX9-LABEL: v_test_v2i16_x_sub_7_0:
979 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
980 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
981 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
982 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
983 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
984 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
985 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
986 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
987 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
988 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
989 ; GFX9-NEXT: s_waitcnt vmcnt(0)
990 ; GFX9-NEXT: v_pk_sub_i16 v2, v3, 7
991 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
992 ; GFX9-NEXT: s_endpgm
993 %tid = call i32 @llvm.amdgcn.workitem.id.x()
994 %tid.ext = sext i32 %tid to i64
995 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
996 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
997 %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
998 %result = sub <2 x i16> %x, <i16 7, i16 0>
999 store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
1003 ; Can fold 0 and inline immediate in other half.
; sub <i16 0, i16 16>: only the high half changes. GFX9 steers the inline
; immediate 16 into the high lane with op_sel:[0,1] op_sel_hi:[1,0]; VI adds
; -16 to the high half via SDWA and merges the original low half back in;
; SI folds the whole operation into one 32-bit add of 0xfff00000
; (-16 << 16 — the carry out of the low half cannot occur since it adds 0).
1004 define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1005 ; SI-LABEL: v_test_v2i16_x_sub_0_16:
1007 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1008 ; SI-NEXT: s_mov_b32 s7, 0xf000
1009 ; SI-NEXT: s_mov_b32 s6, 0
1010 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1011 ; SI-NEXT: v_mov_b32_e32 v1, 0
1012 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1013 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1014 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1015 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1016 ; SI-NEXT: s_waitcnt vmcnt(0)
1017 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xfff00000, v2
1018 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1021 ; VI-LABEL: v_test_v2i16_x_sub_0_16:
1023 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1024 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1025 ; VI-NEXT: v_mov_b32_e32 v3, -16
1026 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1027 ; VI-NEXT: v_mov_b32_e32 v1, s3
1028 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1029 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1030 ; VI-NEXT: flat_load_dword v4, v[0:1]
1031 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1032 ; VI-NEXT: v_mov_b32_e32 v1, s1
1033 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1034 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1035 ; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1036 ; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1037 ; VI-NEXT: flat_store_dword v[0:1], v2
1040 ; GFX9-LABEL: v_test_v2i16_x_sub_0_16:
1042 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1043 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1044 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1045 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1046 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1047 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1048 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
1049 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
1050 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1051 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1052 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1053 ; GFX9-NEXT: v_pk_sub_i16 v2, v3, 16 op_sel:[0,1] op_sel_hi:[1,0]
1054 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1055 ; GFX9-NEXT: s_endpgm
1056 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1057 %tid.ext = sext i32 %tid to i64
1058 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1059 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1060 %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
1061 %result = sub <2 x i16> %x, <i16 0, i16 16>
1062 store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
; sub <i16 0, i16 -15360>: 0xc400 (-15360) is the bit pattern of half -4.0,
; so GFX9 can encode the subtrahend as the inline FP constant -4.0 routed to
; the high lane; VI instead adds 0x3c00 (+15360) to the high half via SDWA;
; SI collapses it into a single 32-bit add of 0x3c000000.
1066 define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1067 ; SI-LABEL: v_test_v2i16_x_sub_0_1_0:
1069 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1070 ; SI-NEXT: s_mov_b32 s7, 0xf000
1071 ; SI-NEXT: s_mov_b32 s6, 0
1072 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1073 ; SI-NEXT: v_mov_b32_e32 v1, 0
1074 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1075 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1076 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1077 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1078 ; SI-NEXT: s_waitcnt vmcnt(0)
1079 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3c000000, v2
1080 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1083 ; VI-LABEL: v_test_v2i16_x_sub_0_1_0:
1085 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1086 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1087 ; VI-NEXT: v_mov_b32_e32 v3, 0x3c00
1088 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1089 ; VI-NEXT: v_mov_b32_e32 v1, s3
1090 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1091 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1092 ; VI-NEXT: flat_load_dword v4, v[0:1]
1093 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1094 ; VI-NEXT: v_mov_b32_e32 v1, s1
1095 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1096 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1097 ; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1098 ; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1099 ; VI-NEXT: flat_store_dword v[0:1], v2
1102 ; GFX9-LABEL: v_test_v2i16_x_sub_0_1_0:
1104 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1105 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1106 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1107 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1108 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1109 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1110 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
1111 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
1112 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1113 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1114 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1115 ; GFX9-NEXT: v_pk_sub_i16 v2, v3, -4.0 op_sel:[0,1] op_sel_hi:[1,0]
1116 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1117 ; GFX9-NEXT: s_endpgm
1118 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1119 %tid.ext = sext i32 %tid to i64
1120 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1121 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1122 %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
1123 %result = sub <2 x i16> %x, <i16 0, i16 -15360>
1124 store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
; sub <i16 0, i16 17408>: 0x4400 (17408) is the bit pattern of half 4.0, so
; GFX9 encodes the subtrahend as the inline FP constant 4.0 in the high
; lane; VI adds 0xbc00 (-17408) to the high half via SDWA; SI collapses it
; into one 32-bit add of 0xbc000000.
1128 define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1129 ; SI-LABEL: v_test_v2i16_x_sub_0_neg1_0:
1131 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1132 ; SI-NEXT: s_mov_b32 s7, 0xf000
1133 ; SI-NEXT: s_mov_b32 s6, 0
1134 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1135 ; SI-NEXT: v_mov_b32_e32 v1, 0
1136 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1137 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1138 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1139 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1140 ; SI-NEXT: s_waitcnt vmcnt(0)
1141 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xbc000000, v2
1142 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1145 ; VI-LABEL: v_test_v2i16_x_sub_0_neg1_0:
1147 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1148 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1149 ; VI-NEXT: v_mov_b32_e32 v3, 0xffffbc00
1150 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1151 ; VI-NEXT: v_mov_b32_e32 v1, s3
1152 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1153 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1154 ; VI-NEXT: flat_load_dword v4, v[0:1]
1155 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1156 ; VI-NEXT: v_mov_b32_e32 v1, s1
1157 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1158 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1159 ; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1160 ; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1161 ; VI-NEXT: flat_store_dword v[0:1], v2
1164 ; GFX9-LABEL: v_test_v2i16_x_sub_0_neg1_0:
1166 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1167 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1168 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1169 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1170 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1171 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1172 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
1173 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
1174 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1175 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1176 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1177 ; GFX9-NEXT: v_pk_sub_i16 v2, v3, 4.0 op_sel:[0,1] op_sel_hi:[1,0]
1178 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1179 ; GFX9-NEXT: s_endpgm
1180 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1181 %tid.ext = sext i32 %tid to i64
1182 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1183 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1184 %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
1185 %result = sub <2 x i16> %x, <i16 0, i16 17408>
1186 store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
1190 ; -32 isn't an inline immediate, but 32 is
; add <i16 -32, i16 -32> is canonicalized to a sub of +32 in both halves:
; GFX9 emits v_pk_sub_u16 with inline immediate 32 applied to both lanes
; (op_sel_hi:[1,0] routes the same 16-bit value to the high lane); VI adds
; 0xffffffe0 to the low half and subtracts 32 from the high half via SDWA;
; SI subtracts 32 low and adds 0xffe00000 (-32 << 16) high.
1191 define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1192 ; SI-LABEL: v_test_v2i16_x_add_neg32_neg32:
1194 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1195 ; SI-NEXT: s_mov_b32 s7, 0xf000
1196 ; SI-NEXT: s_mov_b32 s6, 0
1197 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1198 ; SI-NEXT: v_mov_b32_e32 v1, 0
1199 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1200 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1201 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1202 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1203 ; SI-NEXT: s_waitcnt vmcnt(0)
1204 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
1205 ; SI-NEXT: v_subrev_i32_e32 v2, vcc, 32, v2
1206 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
1207 ; SI-NEXT: v_or_b32_e32 v2, v3, v2
1208 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffe00000, v2
1209 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1212 ; VI-LABEL: v_test_v2i16_x_add_neg32_neg32:
1214 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1215 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1216 ; VI-NEXT: v_mov_b32_e32 v3, 32
1217 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1218 ; VI-NEXT: v_mov_b32_e32 v1, s3
1219 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1220 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1221 ; VI-NEXT: flat_load_dword v4, v[0:1]
1222 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1223 ; VI-NEXT: v_mov_b32_e32 v1, s1
1224 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1225 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1226 ; VI-NEXT: v_add_u16_e32 v2, 0xffffffe0, v4
1227 ; VI-NEXT: v_sub_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1228 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
1229 ; VI-NEXT: flat_store_dword v[0:1], v2
1232 ; GFX9-LABEL: v_test_v2i16_x_add_neg32_neg32:
1234 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1235 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1236 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1237 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1238 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1239 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1240 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
1241 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
1242 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1243 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1244 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1245 ; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel_hi:[1,0]
1246 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1247 ; GFX9-NEXT: s_endpgm
1248 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1249 %tid.ext = sext i32 %tid to i64
1250 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1251 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1252 %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
1253 %result = add <2 x i16> %x, <i16 -32, i16 -32>
1254 store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
; add <i16 0, i16 -32> becomes a sub of 32 in the high half only: GFX9
; steers the inline immediate 32 to the high lane with op_sel:[0,1]
; op_sel_hi:[1,0]; VI subtracts 32 from the high half via SDWA and merges
; the untouched low half; SI folds it into one 32-bit add of 0xffe00000.
1258 define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1259 ; SI-LABEL: v_test_v2i16_x_add_0_neg32:
1261 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1262 ; SI-NEXT: s_mov_b32 s7, 0xf000
1263 ; SI-NEXT: s_mov_b32 s6, 0
1264 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1265 ; SI-NEXT: v_mov_b32_e32 v1, 0
1266 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1267 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1268 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1269 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1270 ; SI-NEXT: s_waitcnt vmcnt(0)
1271 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffe00000, v2
1272 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1275 ; VI-LABEL: v_test_v2i16_x_add_0_neg32:
1277 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1278 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1279 ; VI-NEXT: v_mov_b32_e32 v3, 32
1280 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1281 ; VI-NEXT: v_mov_b32_e32 v1, s3
1282 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1283 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1284 ; VI-NEXT: flat_load_dword v4, v[0:1]
1285 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1286 ; VI-NEXT: v_mov_b32_e32 v1, s1
1287 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1288 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1289 ; VI-NEXT: v_sub_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1290 ; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1291 ; VI-NEXT: flat_store_dword v[0:1], v2
1294 ; GFX9-LABEL: v_test_v2i16_x_add_0_neg32:
1296 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1297 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1298 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1299 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1300 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1301 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1302 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
1303 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
1304 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1305 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1306 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1307 ; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel:[0,1] op_sel_hi:[1,0]
1308 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1309 ; GFX9-NEXT: s_endpgm
1310 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1311 %tid.ext = sext i32 %tid to i64
1312 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1313 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1314 %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
1315 %result = add <2 x i16> %x, <i16 0, i16 -32>
1316 store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
; add <i16 -32, i16 0> becomes a sub of 32 in the low half: GFX9 emits
; v_pk_sub_u16 v2, v3, 32 with no lane modifiers, leaving the high half
; unchanged (its subtrahend is 0); VI adds 0xffffffe0 (-32) to the low half
; and reassembles the high half with and/or; SI does the same with 32-bit
; sub plus masking.
1320 define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1321 ; SI-LABEL: v_test_v2i16_x_add_neg32_0:
1323 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1324 ; SI-NEXT: s_mov_b32 s7, 0xf000
1325 ; SI-NEXT: s_mov_b32 s6, 0
1326 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1327 ; SI-NEXT: v_mov_b32_e32 v1, 0
1328 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1329 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1330 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1331 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1332 ; SI-NEXT: s_waitcnt vmcnt(0)
1333 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
1334 ; SI-NEXT: v_subrev_i32_e32 v2, vcc, 32, v2
1335 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
1336 ; SI-NEXT: v_or_b32_e32 v2, v2, v3
1337 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1340 ; VI-LABEL: v_test_v2i16_x_add_neg32_0:
1342 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1343 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1344 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1345 ; VI-NEXT: v_mov_b32_e32 v1, s3
1346 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1347 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1348 ; VI-NEXT: flat_load_dword v3, v[0:1]
1349 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1350 ; VI-NEXT: v_mov_b32_e32 v1, s1
1351 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1352 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1353 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
1354 ; VI-NEXT: v_add_u16_e32 v3, 0xffffffe0, v3
1355 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
1356 ; VI-NEXT: flat_store_dword v[0:1], v2
1359 ; GFX9-LABEL: v_test_v2i16_x_add_neg32_0:
1361 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1362 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1363 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1364 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1365 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1366 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1367 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
1368 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
1369 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1370 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1371 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1372 ; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32
1373 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1374 ; GFX9-NEXT: s_endpgm
1375 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1376 %tid.ext = sext i32 %tid to i64
1377 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1378 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1379 %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
1380 %result = add <2 x i16> %x, <i16 -32, i16 0>
1381 store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
1385 ; 16 and -16 are both inline immediates
; add <i16 -16, i16 -16>: GFX9 still canonicalizes to the sub form,
; v_pk_sub_u16 with inline immediate 16 in both lanes (op_sel_hi:[1,0]);
; VI keeps the add, -16 directly on the low half and via SDWA on the high
; half; SI adds -16 low and 0xfff00000 (-16 << 16) high.
1386 define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1387 ; SI-LABEL: v_test_v2i16_x_add_neg16_neg16:
1389 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1390 ; SI-NEXT: s_mov_b32 s7, 0xf000
1391 ; SI-NEXT: s_mov_b32 s6, 0
1392 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1393 ; SI-NEXT: v_mov_b32_e32 v1, 0
1394 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1395 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1396 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1397 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1398 ; SI-NEXT: s_waitcnt vmcnt(0)
1399 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
1400 ; SI-NEXT: v_add_i32_e32 v2, vcc, -16, v2
1401 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
1402 ; SI-NEXT: v_or_b32_e32 v2, v3, v2
1403 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xfff00000, v2
1404 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1407 ; VI-LABEL: v_test_v2i16_x_add_neg16_neg16:
1409 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1410 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1411 ; VI-NEXT: v_mov_b32_e32 v3, -16
1412 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1413 ; VI-NEXT: v_mov_b32_e32 v1, s3
1414 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1415 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1416 ; VI-NEXT: flat_load_dword v4, v[0:1]
1417 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1418 ; VI-NEXT: v_mov_b32_e32 v1, s1
1419 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1420 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1421 ; VI-NEXT: v_add_u16_e32 v2, -16, v4
1422 ; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1423 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
1424 ; VI-NEXT: flat_store_dword v[0:1], v2
1427 ; GFX9-LABEL: v_test_v2i16_x_add_neg16_neg16:
1429 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1430 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1431 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1432 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1433 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1434 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1435 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
1436 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
1437 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1438 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1439 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1440 ; GFX9-NEXT: v_pk_sub_u16 v2, v3, 16 op_sel_hi:[1,0]
1441 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1442 ; GFX9-NEXT: s_endpgm
1443 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1444 %tid.ext = sext i32 %tid to i64
1445 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1446 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1447 %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
1448 %result = add <2 x i16> %x, <i16 -16, i16 -16>
1449 store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
; add <i16 0, i16 -16> becomes a sub of 16 in the high half only: GFX9
; steers the inline immediate 16 to the high lane (op_sel:[0,1]
; op_sel_hi:[1,0]); VI adds -16 to the high half via SDWA and merges the
; low half back in; SI folds everything into one 32-bit add of 0xfff00000.
1453 define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1454 ; SI-LABEL: v_test_v2i16_x_add_0_neg16:
1456 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1457 ; SI-NEXT: s_mov_b32 s7, 0xf000
1458 ; SI-NEXT: s_mov_b32 s6, 0
1459 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1460 ; SI-NEXT: v_mov_b32_e32 v1, 0
1461 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1462 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1463 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1464 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1465 ; SI-NEXT: s_waitcnt vmcnt(0)
1466 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xfff00000, v2
1467 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1470 ; VI-LABEL: v_test_v2i16_x_add_0_neg16:
1472 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1473 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1474 ; VI-NEXT: v_mov_b32_e32 v3, -16
1475 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1476 ; VI-NEXT: v_mov_b32_e32 v1, s3
1477 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1478 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1479 ; VI-NEXT: flat_load_dword v4, v[0:1]
1480 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1481 ; VI-NEXT: v_mov_b32_e32 v1, s1
1482 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1483 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1484 ; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1485 ; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1486 ; VI-NEXT: flat_store_dword v[0:1], v2
1489 ; GFX9-LABEL: v_test_v2i16_x_add_0_neg16:
1491 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1492 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1493 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1494 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1495 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1496 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1497 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
1498 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
1499 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1500 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1501 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1502 ; GFX9-NEXT: v_pk_sub_u16 v2, v3, 16 op_sel:[0,1] op_sel_hi:[1,0]
1503 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1504 ; GFX9-NEXT: s_endpgm
1505 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1506 %tid.ext = sext i32 %tid to i64
1507 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1508 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1509 %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
1510 %result = add <2 x i16> %x, <i16 0, i16 -16>
1511 store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
; add <i16 -16, i16 0> becomes a sub of 16 in the low half: GFX9 emits
; v_pk_sub_u16 v2, v3, 16 with no lane modifiers, leaving the high half
; unchanged; VI adds -16 to the low half and reassembles the high half with
; and/or; SI does the same with 32-bit add plus masking.
1515 define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1516 ; SI-LABEL: v_test_v2i16_x_add_neg16_0:
1518 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1519 ; SI-NEXT: s_mov_b32 s7, 0xf000
1520 ; SI-NEXT: s_mov_b32 s6, 0
1521 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1522 ; SI-NEXT: v_mov_b32_e32 v1, 0
1523 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1524 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1525 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1526 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1527 ; SI-NEXT: s_waitcnt vmcnt(0)
1528 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
1529 ; SI-NEXT: v_add_i32_e32 v2, vcc, -16, v2
1530 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
1531 ; SI-NEXT: v_or_b32_e32 v2, v2, v3
1532 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1535 ; VI-LABEL: v_test_v2i16_x_add_neg16_0:
1537 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1538 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1539 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1540 ; VI-NEXT: v_mov_b32_e32 v1, s3
1541 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1542 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1543 ; VI-NEXT: flat_load_dword v3, v[0:1]
1544 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1545 ; VI-NEXT: v_mov_b32_e32 v1, s1
1546 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1547 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1548 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
1549 ; VI-NEXT: v_add_u16_e32 v3, -16, v3
1550 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
1551 ; VI-NEXT: flat_store_dword v[0:1], v2
1554 ; GFX9-LABEL: v_test_v2i16_x_add_neg16_0:
1556 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1557 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1558 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1559 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1560 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1561 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1562 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
1563 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
1564 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1565 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1566 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1567 ; GFX9-NEXT: v_pk_sub_u16 v2, v3, 16
1568 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1569 ; GFX9-NEXT: s_endpgm
1570 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1571 %tid.ext = sext i32 %tid to i64
1572 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1573 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1574 %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
1575 %result = add <2 x i16> %x, <i16 -16, i16 0>
1576 store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
; Adds the 16-bit two's-complement negation of the fp16 1.0 bit pattern
; (0x3c00), i.e. 0xc400 == -15360, to both <2 x i16> lanes.  GFX9 folds the
; add into "v_pk_sub_u16 v2, v3, 1.0" so the inline fp constant can be used;
; VI splits the work into v_add_u16 (low half) + v_add_u16_sdwa (high half);
; SI, lacking packed 16-bit ops, masks and adds each half with 32-bit adds.
1580 define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1581 ; SI-LABEL: v_test_v2i16_x_add_neg_fpone:
1583 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1584 ; SI-NEXT: s_mov_b32 s7, 0xf000
1585 ; SI-NEXT: s_mov_b32 s6, 0
1586 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1587 ; SI-NEXT: v_mov_b32_e32 v1, 0
1588 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1589 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1590 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1591 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1592 ; SI-NEXT: s_waitcnt vmcnt(0)
1593 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
1594 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffffc400, v2
1595 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
1596 ; SI-NEXT: v_or_b32_e32 v2, v3, v2
1597 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xc4000000, v2
1598 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1601 ; VI-LABEL: v_test_v2i16_x_add_neg_fpone:
1603 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1604 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1605 ; VI-NEXT: v_mov_b32_e32 v3, 0xffffc400
1606 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1607 ; VI-NEXT: v_mov_b32_e32 v1, s3
1608 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1609 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1610 ; VI-NEXT: flat_load_dword v4, v[0:1]
1611 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1612 ; VI-NEXT: v_mov_b32_e32 v1, s1
1613 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1614 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1615 ; VI-NEXT: v_add_u16_e32 v2, 0xffffc400, v4
1616 ; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1617 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
1618 ; VI-NEXT: flat_store_dword v[0:1], v2
1621 ; GFX9-LABEL: v_test_v2i16_x_add_neg_fpone:
1623 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1624 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1625 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1626 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1627 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1628 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1629 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
1630 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
1631 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1632 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1633 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1634 ; GFX9-NEXT: v_pk_sub_u16 v2, v3, 1.0 op_sel_hi:[1,0]
1635 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1636 ; GFX9-NEXT: s_endpgm
1637 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1638 %tid.ext = sext i32 %tid to i64
1639 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1640 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1641 %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
; -15360 == 0xc400 per lane: adding it is equivalent to subtracting 0x3c00,
; the fp16 1.0 bit pattern, which GFX9 encodes as an inline constant.
1642 %result = add <2 x i16> %x, <i16 -15360, i16 -15360>
1643 store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
; Adds the 16-bit negation of the fp16 -1.0 bit pattern (0xbc00), i.e.
; 0x4400 == 17408, to both lanes; GFX9 folds this to
; "v_pk_sub_u16 v2, v3, -1.0" using the inline -1.0 constant.  On VI,
; 0x4400 also happens to be the fp16 4.0 pattern, hence the
; "v_add_u16_e32 v2, 4.0, v4" inline-constant form for the low half.
1647 define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1648 ; SI-LABEL: v_test_v2i16_x_add_neg_negfpone:
1650 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1651 ; SI-NEXT: s_mov_b32 s7, 0xf000
1652 ; SI-NEXT: s_mov_b32 s6, 0
1653 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1654 ; SI-NEXT: v_mov_b32_e32 v1, 0
1655 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1656 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1657 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1658 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1659 ; SI-NEXT: s_waitcnt vmcnt(0)
1660 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
1661 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4400, v2
1662 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
1663 ; SI-NEXT: v_or_b32_e32 v2, v3, v2
1664 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44000000, v2
1665 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1668 ; VI-LABEL: v_test_v2i16_x_add_neg_negfpone:
1670 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1671 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1672 ; VI-NEXT: v_mov_b32_e32 v3, 0x4400
1673 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1674 ; VI-NEXT: v_mov_b32_e32 v1, s3
1675 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1676 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1677 ; VI-NEXT: flat_load_dword v4, v[0:1]
1678 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1679 ; VI-NEXT: v_mov_b32_e32 v1, s1
1680 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1681 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1682 ; VI-NEXT: v_add_u16_e32 v2, 4.0, v4
1683 ; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1684 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
1685 ; VI-NEXT: flat_store_dword v[0:1], v2
1688 ; GFX9-LABEL: v_test_v2i16_x_add_neg_negfpone:
1690 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1691 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1692 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1693 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1694 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1695 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1696 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
1697 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
1698 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1699 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1700 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1701 ; GFX9-NEXT: v_pk_sub_u16 v2, v3, -1.0 op_sel_hi:[1,0]
1702 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1703 ; GFX9-NEXT: s_endpgm
1704 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1705 %tid.ext = sext i32 %tid to i64
1706 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1707 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1708 %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
; 17408 == 0x4400 per lane: adding it is equivalent to subtracting 0xbc00,
; the fp16 -1.0 bit pattern (inline constant on GFX9).
1709 %result = add <2 x i16> %x, <i16 17408, i16 17408>
1710 store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
; Adds 0x4000 == 16384 (the fp16 2.0 bit pattern) to both lanes; since
; 0x4000 == -(0xc000) in 16 bits, GFX9 rewrites it as
; "v_pk_sub_u16 v2, v3, -2.0" to use the inline -2.0 constant.  VI likewise
; reuses inline 2.0 for the low-half add, and SI reuses the f32 inline 2.0
; (0x40000000) for the high-half 32-bit add.
1714 define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1715 ; SI-LABEL: v_test_v2i16_x_add_neg_fptwo:
1717 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1718 ; SI-NEXT: s_mov_b32 s7, 0xf000
1719 ; SI-NEXT: s_mov_b32 s6, 0
1720 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1721 ; SI-NEXT: v_mov_b32_e32 v1, 0
1722 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1723 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1724 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1725 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1726 ; SI-NEXT: s_waitcnt vmcnt(0)
1727 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
1728 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4000, v2
1729 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
1730 ; SI-NEXT: v_or_b32_e32 v2, v3, v2
1731 ; SI-NEXT: v_add_i32_e32 v2, vcc, 2.0, v2
1732 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1735 ; VI-LABEL: v_test_v2i16_x_add_neg_fptwo:
1737 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1738 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1739 ; VI-NEXT: v_mov_b32_e32 v3, 0x4000
1740 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1741 ; VI-NEXT: v_mov_b32_e32 v1, s3
1742 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1743 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1744 ; VI-NEXT: flat_load_dword v4, v[0:1]
1745 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1746 ; VI-NEXT: v_mov_b32_e32 v1, s1
1747 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1748 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1749 ; VI-NEXT: v_add_u16_e32 v2, 2.0, v4
1750 ; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1751 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
1752 ; VI-NEXT: flat_store_dword v[0:1], v2
1755 ; GFX9-LABEL: v_test_v2i16_x_add_neg_fptwo:
1757 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1758 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1759 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1760 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1761 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1762 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1763 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
1764 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
1765 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1766 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1767 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1768 ; GFX9-NEXT: v_pk_sub_u16 v2, v3, -2.0 op_sel_hi:[1,0]
1769 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1770 ; GFX9-NEXT: s_endpgm
1771 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1772 %tid.ext = sext i32 %tid to i64
1773 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1774 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1775 %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
; 16384 == 0x4000 per lane: adding it is equivalent to subtracting 0xc000,
; the fp16 -2.0 bit pattern (inline constant on GFX9).
1776 %result = add <2 x i16> %x, <i16 16384, i16 16384>
1777 store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
; Adds 0xc000 == -16384 (the fp16 -2.0 bit pattern) to both lanes; GFX9
; folds this to "v_pk_sub_u16 v2, v3, 2.0", subtracting the inline 2.0
; constant instead of materializing 0xc000.  The SI high-half 32-bit add
; likewise uses the f32 inline -2.0 (0xc0000000).
1781 define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1782 ; SI-LABEL: v_test_v2i16_x_add_neg_negfptwo:
1784 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1785 ; SI-NEXT: s_mov_b32 s7, 0xf000
1786 ; SI-NEXT: s_mov_b32 s6, 0
1787 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1788 ; SI-NEXT: v_mov_b32_e32 v1, 0
1789 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1790 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1791 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1792 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1793 ; SI-NEXT: s_waitcnt vmcnt(0)
1794 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
1795 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffffc000, v2
1796 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
1797 ; SI-NEXT: v_or_b32_e32 v2, v3, v2
1798 ; SI-NEXT: v_add_i32_e32 v2, vcc, -2.0, v2
1799 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1802 ; VI-LABEL: v_test_v2i16_x_add_neg_negfptwo:
1804 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1805 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1806 ; VI-NEXT: v_mov_b32_e32 v3, 0xffffc000
1807 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1808 ; VI-NEXT: v_mov_b32_e32 v1, s3
1809 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1810 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1811 ; VI-NEXT: flat_load_dword v4, v[0:1]
1812 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1813 ; VI-NEXT: v_mov_b32_e32 v1, s1
1814 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1815 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1816 ; VI-NEXT: v_add_u16_e32 v2, 0xffffc000, v4
1817 ; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1818 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
1819 ; VI-NEXT: flat_store_dword v[0:1], v2
1822 ; GFX9-LABEL: v_test_v2i16_x_add_neg_negfptwo:
1824 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1825 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1826 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1827 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1828 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1829 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1830 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
1831 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
1832 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1833 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1834 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1835 ; GFX9-NEXT: v_pk_sub_u16 v2, v3, 2.0 op_sel_hi:[1,0]
1836 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1837 ; GFX9-NEXT: s_endpgm
1838 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1839 %tid.ext = sext i32 %tid to i64
1840 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1841 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1842 %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
; -16384 == 0xc000 per lane: adding it is equivalent to subtracting 0x4000,
; the fp16 2.0 bit pattern (inline constant on GFX9).
1843 %result = add <2 x i16> %x, <i16 -16384, i16 -16384>
1844 store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
; The low lane of the vector constant is undef, so only the high lane's
; "+ -32" has to be computed.  SI masks off the low half and adds -32
; shifted into the high 16 bits (0xffe00000); VI needs just one SDWA sub
; on WORD_1; GFX9 uses v_pk_sub_u16 with op_sel/op_sel_hi modifiers to
; route the inline 32 to the lane that matters.
1848 define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1849 ; SI-LABEL: v_test_v2i16_x_add_undef_neg32:
1851 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1852 ; SI-NEXT: s_mov_b32 s7, 0xf000
1853 ; SI-NEXT: s_mov_b32 s6, 0
1854 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1855 ; SI-NEXT: v_mov_b32_e32 v1, 0
1856 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1857 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1858 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1859 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1860 ; SI-NEXT: s_waitcnt vmcnt(0)
1861 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
1862 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffe00000, v2
1863 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1866 ; VI-LABEL: v_test_v2i16_x_add_undef_neg32:
1868 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1869 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1870 ; VI-NEXT: v_mov_b32_e32 v3, 32
1871 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1872 ; VI-NEXT: v_mov_b32_e32 v1, s3
1873 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1874 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1875 ; VI-NEXT: flat_load_dword v4, v[0:1]
1876 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1877 ; VI-NEXT: v_mov_b32_e32 v1, s1
1878 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1879 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1880 ; VI-NEXT: v_sub_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1881 ; VI-NEXT: flat_store_dword v[0:1], v2
1884 ; GFX9-LABEL: v_test_v2i16_x_add_undef_neg32:
1886 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1887 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1888 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1889 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1890 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1891 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1892 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
1893 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
1894 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1895 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1896 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1897 ; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel:[0,1] op_sel_hi:[1,0]
1898 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1899 ; GFX9-NEXT: s_endpgm
1900 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1901 %tid.ext = sext i32 %tid to i64
1902 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1903 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1904 %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
; Low lane undef: the backend is free to emit anything for it.
1905 %result = add <2 x i16> %x, <i16 undef, i16 -32>
1906 store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
; Mirror of the previous test: the high lane is undef, so only the low
; lane's "+ -32" matters.  SI and VI shrink the add to a plain subrev of
; the inline 32 on the low 16 bits; GFX9 emits "v_pk_sub_u16 v2, v3, 32"
; with no op_sel modifiers since the high-lane result is a don't-care.
1910 define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
1911 ; SI-LABEL: v_test_v2i16_x_add_neg32_undef:
1913 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1914 ; SI-NEXT: s_mov_b32 s7, 0xf000
1915 ; SI-NEXT: s_mov_b32 s6, 0
1916 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1917 ; SI-NEXT: v_mov_b32_e32 v1, 0
1918 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1919 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1920 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1921 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1922 ; SI-NEXT: s_waitcnt vmcnt(0)
1923 ; SI-NEXT: v_subrev_i32_e32 v2, vcc, 32, v2
1924 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
1925 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1928 ; VI-LABEL: v_test_v2i16_x_add_neg32_undef:
1930 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1931 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1932 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1933 ; VI-NEXT: v_mov_b32_e32 v1, s3
1934 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1935 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1936 ; VI-NEXT: flat_load_dword v3, v[0:1]
1937 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1938 ; VI-NEXT: v_mov_b32_e32 v1, s1
1939 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1940 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1941 ; VI-NEXT: v_subrev_u16_e32 v2, 32, v3
1942 ; VI-NEXT: flat_store_dword v[0:1], v2
1945 ; GFX9-LABEL: v_test_v2i16_x_add_neg32_undef:
1947 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1948 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1949 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1950 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1951 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1952 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1953 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
1954 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
1955 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1956 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1957 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1958 ; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32
1959 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1960 ; GFX9-NEXT: s_endpgm
1961 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1962 %tid.ext = sext i32 %tid to i64
1963 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1964 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1965 %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
; High lane undef: the backend is free to emit anything for it.
1966 %result = add <2 x i16> %x, <i16 -32, i16 undef>
1967 store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
1971 declare i32 @llvm.amdgcn.workitem.id.x() #1
1973 attributes #0 = { nounwind }
1974 attributes #1 = { nounwind readnone }