; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=SI %s
; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s
; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s

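; The 4-byte-aligned i16 load from constant address space is widened to a full
; 32-bit scalar load (s_load_dword / s_load_b32) on all targets.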
define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) {
; SI-LABEL: widen_i16_constant_load:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s1, s[0:1], 0x0
; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_addk_i32 s1, 0x3e7
; SI-NEXT: s_or_b32 s4, s1, 4
; SI-NEXT: s_mov_b32 s1, s0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: widen_i16_constant_load:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s0, s[0:1], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_addk_i32 s0, 0x3e7
; VI-NEXT: s_or_b32 s0, s0, 4
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: widen_i16_constant_load:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s0, 0x3e7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, 4
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
  %load = load i16, ptr addrspace(4) %arg, align 4
  %add = add i16 %load, 999
  %or = or i16 %add, 4
  store i16 %or, ptr addrspace(1) null
  ret void
}

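; Same widened scalar load, with the loaded value zero-extended to i32 before
; the add.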
define amdgpu_kernel void @widen_i16_constant_load_zext_i32(ptr addrspace(4) %arg) {
; SI-LABEL: widen_i16_constant_load_zext_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s1, s[0:1], 0x0
; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s1, s1, 0xffff
; SI-NEXT: s_addk_i32 s1, 0x3e7
; SI-NEXT: s_or_b32 s4, s1, 4
; SI-NEXT: s_mov_b32 s1, s0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: widen_i16_constant_load_zext_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s0, s[0:1], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: s_addk_i32 s0, 0x3e7
; VI-NEXT: s_or_b32 s0, s0, 4
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: widen_i16_constant_load_zext_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_addk_i32 s0, 0x3e7
; GFX11-NEXT: s_or_b32 s0, s0, 4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
  %load = load i16, ptr addrspace(4) %arg, align 4
  %ext = zext i16 %load to i32
  %add = add i32 %ext, 999
  %or = or i32 %add, 4
  store i32 %or, ptr addrspace(1) null
  ret void
}

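; Same widened scalar load, with the loaded value sign-extended to i32
; (s_sext_i32_i16) before the add.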
define amdgpu_kernel void @widen_i16_constant_load_sext_i32(ptr addrspace(4) %arg) {
; SI-LABEL: widen_i16_constant_load_sext_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s1, s[0:1], 0x0
; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_sext_i32_i16 s1, s1
; SI-NEXT: s_addk_i32 s1, 0x3e7
; SI-NEXT: s_or_b32 s4, s1, 4
; SI-NEXT: s_mov_b32 s1, s0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: widen_i16_constant_load_sext_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s0, s[0:1], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_sext_i32_i16 s0, s0
; VI-NEXT: s_addk_i32 s0, 0x3e7
; VI-NEXT: s_or_b32 s0, s0, 4
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: widen_i16_constant_load_sext_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sext_i32_i16 s0, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_addk_i32 s0, 0x3e7
; GFX11-NEXT: s_or_b32 s0, s0, 4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
  %load = load i16, ptr addrspace(4) %arg, align 4
  %ext = sext i16 %load to i32
  %add = add i32 %ext, 999
  %or = or i32 %add, 4
  store i32 %or, ptr addrspace(1) null
  ret void
}

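; A non-byte-sized i17 load is widened as well; the result is stored as the low
; 16 bits plus a separate byte holding bit 16.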
define amdgpu_kernel void @widen_i17_constant_load(ptr addrspace(4) %arg) {
; SI-LABEL: widen_i17_constant_load:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s1, s0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s7, s[4:5], 0x0
; SI-NEXT: s_mov_b32 s4, 2
; SI-NEXT: s_mov_b32 s5, s0
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_add_i32 s7, s7, 34
; SI-NEXT: s_or_b32 s7, s7, 4
; SI-NEXT: v_mov_b32_e32 v0, s7
; SI-NEXT: s_bfe_u32 s8, s7, 0x10010
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: widen_i17_constant_load:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, 2
; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s0, s[0:1], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_i32 s0, s0, 34
; VI-NEXT: s_or_b32 s0, s0, 4
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: s_bfe_u32 s0, s0, 0x10010
; VI-NEXT: flat_store_short v[0:1], v4
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: flat_store_byte v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: widen_i17_constant_load:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_i32 s0, s0, 34
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, 4
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, s0
; GFX11-NEXT: s_and_b32 s0, s0, 0x1ffff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b16 v[0:1], v4, off
; GFX11-NEXT: global_store_d16_hi_b8 v[2:3], v5, off
; GFX11-NEXT: s_endpgm
  %load = load i17, ptr addrspace(4) %arg, align 4
  %add = add i17 %load, 34
  %or = or i17 %add, 4
  store i17 %or, ptr addrspace(1) null
  ret void
}

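; A half load from constant memory is widened to a 32-bit scalar load feeding
; the fadd.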
define amdgpu_kernel void @widen_f16_constant_load(ptr addrspace(4) %arg) {
; SI-LABEL: widen_f16_constant_load:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s0, s[0:1], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_mov_b32 s1, s0
; SI-NEXT: v_add_f32_e32 v0, 4.0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: widen_f16_constant_load:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s0, s[0:1], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f16_e64 v2, s0, 4.0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: widen_f16_constant_load:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_f16_e64 v2, s0, 4.0
; GFX11-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
  %load = load half, ptr addrspace(4) %arg, align 4
  %add = fadd half %load, 4.0
  store half %add, ptr addrspace(1) null
  ret void
}

; FIXME: valu usage on VI
define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) {
; SI-LABEL: widen_v2i8_constant_load:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s1, s[0:1], 0x0
; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s4, s1, 0xff00
; SI-NEXT: s_add_i32 s1, s1, 12
; SI-NEXT: s_or_b32 s1, s1, 4
; SI-NEXT: s_and_b32 s1, s1, 0xff
; SI-NEXT: s_or_b32 s1, s4, s1
; SI-NEXT: s_addk_i32 s1, 0x2c00
; SI-NEXT: s_or_b32 s4, s1, 0x300
; SI-NEXT: s_mov_b32 s1, s0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: widen_v2i8_constant_load:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s0, s[0:1], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s1, s0, 0xffffff00
; VI-NEXT: s_add_i32 s0, s0, 12
; VI-NEXT: s_or_b32 s0, s0, 4
; VI-NEXT: s_and_b32 s0, s0, 0xff
; VI-NEXT: s_or_b32 s0, s1, s0
; VI-NEXT: s_addk_i32 s0, 0x2c00
; VI-NEXT: s_or_b32 s0, s0, 0x300
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: widen_v2i8_constant_load:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_i32 s1, s0, 12
; GFX11-NEXT: s_and_b32 s0, s0, 0xff00
; GFX11-NEXT: s_or_b32 s1, s1, 4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_addk_i32 s0, 0x2c00
; GFX11-NEXT: s_or_b32 s0, s0, 0x300
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
  %load = load <2 x i8>, ptr addrspace(4) %arg, align 4
  %add = add <2 x i8> %load, <i8 12, i8 44>
  %or = or <2 x i8> %add, <i8 4, i8 3>
  store <2 x i8> %or, ptr addrspace(1) null
  ret void
}

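; A divergent (per-thread) address must not be turned into a scalar load; the
; i16 load stays a 16-bit VMEM load.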
define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4) %arg) {
; SI-LABEL: no_widen_i16_constant_divergent_load:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3e7, v0
; SI-NEXT: v_or_b32_e32 v0, 4, v0
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: no_widen_i16_constant_divergent_load:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u16_e32 v2, 0x3e7, v0
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_or_b32_e32 v2, 4, v2
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: no_widen_i16_constant_divergent_load:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v0, v0, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_nc_u16 v2, v0, 0x3e7
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_or_b32_e32 v2, 4, v2
; GFX11-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = zext i32 %tid to i64
  %gep.arg = getelementptr inbounds i16, ptr addrspace(4) %arg, i64 %tid.ext
  %load = load i16, ptr addrspace(4) %gep.arg, align 4
  %add = add i16 %load, 999
  %or = or i16 %add, 4
  store i16 %or, ptr addrspace(1) null
  ret void
}

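; An i1 constant load is also widened; only the low bit is kept for the byte
; store.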
define amdgpu_kernel void @widen_i1_constant_load(ptr addrspace(4) %arg) {
; SI-LABEL: widen_i1_constant_load:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s1, s[0:1], 0x0
; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s4, s1, 1
; SI-NEXT: s_mov_b32 s1, s0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: widen_i1_constant_load:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s0, s[0:1], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s0, s0, 1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: widen_i1_constant_load:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s0, s0, 1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: global_store_b8 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
  %load = load i1, ptr addrspace(4) %arg, align 4
  %and = and i1 %load, true
  store i1 %and, ptr addrspace(1) null
  ret void
}

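; Widened i16 constant load whose result is zero-extended before the 32-bit
; add and or.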
define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(ptr addrspace(4) %arg) {
; SI-LABEL: widen_i16_zextload_i64_constant_load:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s1, s[0:1], 0x0
; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s1, s1, 0xffff
; SI-NEXT: s_addk_i32 s1, 0x3e7
; SI-NEXT: s_or_b32 s4, s1, 4
; SI-NEXT: s_mov_b32 s1, s0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: widen_i16_zextload_i64_constant_load:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s0, s[0:1], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: s_addk_i32 s0, 0x3e7
; VI-NEXT: s_or_b32 s0, s0, 4
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: widen_i16_zextload_i64_constant_load:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_addk_i32 s0, 0x3e7
; GFX11-NEXT: s_or_b32 s0, s0, 4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
  %load = load i16, ptr addrspace(4) %arg, align 4
  %zext = zext i16 %load to i32
  %add = add i32 %zext, 999
  %or = or i32 %add, 4
  store i32 %or, ptr addrspace(1) null
  ret void
}

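; Widened i1 constant load; the bit is zero-extended to i64 for the 64-bit add.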
define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) %arg) {
; SI-LABEL: widen_i1_zext_to_i64_constant_load:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s1, s[0:1], 0x0
; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s1, s1, 1
; SI-NEXT: s_add_u32 s4, s1, 0x3e7
; SI-NEXT: s_addc_u32 s5, 0, 0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: s_mov_b32 s1, s0
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: widen_i1_zext_to_i64_constant_load:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s0, s[0:1], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s0, s0, 1
; VI-NEXT: s_add_u32 s0, s0, 0x3e7
; VI-NEXT: s_addc_u32 s1, 0, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: widen_i1_zext_to_i64_constant_load:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s0, s0, 1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_add_u32 s0, s0, 0x3e7
; GFX11-NEXT: s_addc_u32 s1, 0, 0
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s1
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX11-NEXT: s_endpgm
  %load = load i1, ptr addrspace(4) %arg, align 4
  %zext = zext i1 %load to i64
  %add = add i64 %zext, 999
  store i64 %add, ptr addrspace(1) null
  ret void
}

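; The same widening applies to a 32-bit constant pointer (addrspace(6)).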
define amdgpu_kernel void @widen_i16_constant32_load(ptr addrspace(6) %arg) {
; SI-LABEL: widen_i16_constant32_load:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s0, s[4:5], 0x9
; SI-NEXT: s_mov_b32 s1, 0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s0, s[0:1], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_addk_i32 s0, 0x3e7
; SI-NEXT: s_or_b32 s4, s0, 4
; SI-NEXT: s_mov_b32 s0, s1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: widen_i16_constant32_load:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s0, s[4:5], 0x24
; VI-NEXT: s_mov_b32 s1, 0
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s0, s[0:1], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_addk_i32 s0, 0x3e7
; VI-NEXT: s_or_b32 s0, s0, 4
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: widen_i16_constant32_load:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s0, 0x3e7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, 4
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
  %load = load i16, ptr addrspace(6) %arg, align 4
  %add = add i16 %load, 999
  %or = or i16 %add, 4
  store i16 %or, ptr addrspace(1) null
  ret void
}

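; A load from global memory (addrspace(1)) marked !invariant.load is widened to
; a scalar load as well.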
define amdgpu_kernel void @widen_i16_global_invariant_load(ptr addrspace(1) %arg) {
; SI-LABEL: widen_i16_global_invariant_load:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s1, s[0:1], 0x0
; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_addk_i32 s1, 0x3e7
; SI-NEXT: s_or_b32 s4, s1, 1
; SI-NEXT: s_mov_b32 s1, s0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: widen_i16_global_invariant_load:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s0, s[0:1], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_addk_i32 s0, 0x3e7
; VI-NEXT: s_or_b32 s0, s0, 1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: widen_i16_global_invariant_load:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s0, 0x3e7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, 1
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
  %load = load i16, ptr addrspace(1) %arg, align 4, !invariant.load !0
  %add = add i16 %load, 999
  %or = or i16 %add, 1
  store i16 %or, ptr addrspace(1) null
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()

!0 = !{}