1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,DAG %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GISEL %s
; Uniform i8 load + sext: both selectors (shared GCN prefix) pick the gfx12
; scalar sub-dword load s_load_i8, which sign-extends in the SGPR result.
5 define amdgpu_ps void @test_s_load_i8(ptr addrspace(4) inreg %in, ptr addrspace(1) %out) {
6 ; GCN-LABEL: test_s_load_i8:
8 ; GCN-NEXT: s_load_i8 s0, s[0:1], 0x0
9 ; GCN-NEXT: s_wait_kmcnt 0x0
10 ; GCN-NEXT: v_mov_b32_e32 v2, s0
11 ; GCN-NEXT: global_store_b32 v[0:1], v2, off
13 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
15 %ld = load i8, ptr addrspace(4) %in
16 %sext = sext i8 %ld to i32
17 store i32 %sext, ptr addrspace(1) %out
; Negative GEP offset (-100 = 0x...ff9c) is not folded into the SMEM immediate;
; the base pointer is adjusted first. SDAG materializes the 64-bit constant and
; uses s_add_nc_u64, GlobalISel uses a 32-bit add/add-carry pair — hence the
; separate DAG/GISEL check blocks.
21 define amdgpu_ps void @test_s_load_i8_imm(ptr addrspace(4) inreg %in, ptr addrspace(1) %out) {
22 ; DAG-LABEL: test_s_load_i8_imm:
24 ; DAG-NEXT: s_movk_i32 s2, 0xff9c
25 ; DAG-NEXT: s_mov_b32 s3, -1
26 ; DAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
27 ; DAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
28 ; DAG-NEXT: s_load_i8 s0, s[0:1], 0x0
29 ; DAG-NEXT: s_wait_kmcnt 0x0
30 ; DAG-NEXT: v_mov_b32_e32 v2, s0
31 ; DAG-NEXT: global_store_b32 v[0:1], v2, off
33 ; DAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
36 ; GISEL-LABEL: test_s_load_i8_imm:
38 ; GISEL-NEXT: s_add_co_u32 s0, s0, 0xffffff9c
39 ; GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1
40 ; GISEL-NEXT: s_load_i8 s0, s[0:1], 0x0
41 ; GISEL-NEXT: s_wait_kmcnt 0x0
42 ; GISEL-NEXT: v_mov_b32_e32 v2, s0
43 ; GISEL-NEXT: global_store_b32 v[0:1], v2, off
45 ; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
46 ; GISEL-NEXT: s_endpgm
47 %gep = getelementptr i8, ptr addrspace(4) %in, i64 -100
48 %ld = load i8, ptr addrspace(4) %gep
49 %sext = sext i8 %ld to i32
50 store i32 %sext, ptr addrspace(1) %out
; Uniform (SGPR) byte offset folds directly into the s_load_i8 soffset operand
; with a zero immediate (s2 offset:0x0).
54 define amdgpu_ps void @test_s_load_i8_sgpr(ptr addrspace(4) inreg %in, i32 inreg %offset, ptr addrspace(1) %out) {
55 ; GCN-LABEL: test_s_load_i8_sgpr:
57 ; GCN-NEXT: s_load_i8 s0, s[0:1], s2 offset:0x0
58 ; GCN-NEXT: s_wait_kmcnt 0x0
59 ; GCN-NEXT: v_mov_b32_e32 v2, s0
60 ; GCN-NEXT: global_store_b32 v[0:1], v2, off
62 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
64 %zext = zext i32 %offset to i64
65 %gep = getelementptr i8, ptr addrspace(4) %in, i64 %zext
66 %ld = load i8, ptr addrspace(4) %gep
67 %sext = sext i8 %ld to i32
68 store i32 %sext, ptr addrspace(1) %out
; SGPR offset plus constant 16: both fold into one s_load_i8 as soffset + the
; 0x10 immediate.
72 define amdgpu_ps void @test_s_load_i8_sgpr_imm(ptr addrspace(4) inreg %in, i32 inreg %offset, ptr addrspace(1) %out) {
73 ; GCN-LABEL: test_s_load_i8_sgpr_imm:
75 ; GCN-NEXT: s_load_i8 s0, s[0:1], s2 offset:0x10
76 ; GCN-NEXT: s_wait_kmcnt 0x0
77 ; GCN-NEXT: v_mov_b32_e32 v2, s0
78 ; GCN-NEXT: global_store_b32 v[0:1], v2, off
80 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
82 %gep1 = getelementptr i8, ptr addrspace(4) %in, i64 16
83 %zext = zext i32 %offset to i64
84 %gep2 = getelementptr i8, ptr addrspace(4) %gep1, i64 %zext
85 %ld = load i8, ptr addrspace(4) %gep2
86 %sext = sext i8 %ld to i32
87 store i32 %sext, ptr addrspace(1) %out
; Divergent (VGPR) offset: SMEM cannot be used, so both selectors emit a VMEM
; global_load_i8 with the constant 16 folded into the offset:16 field.
91 define amdgpu_ps void @test_s_load_i8_divergent(ptr addrspace(4) inreg %in, i32 %offset, ptr addrspace(1) %out) {
92 ; GCN-LABEL: test_s_load_i8_divergent:
94 ; GCN-NEXT: global_load_i8 v0, v0, s[0:1] offset:16
95 ; GCN-NEXT: s_wait_loadcnt 0x0
96 ; GCN-NEXT: global_store_b32 v[1:2], v0, off
98 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
100 %gep1 = getelementptr i8, ptr addrspace(4) %in, i64 16
101 %zext = zext i32 %offset to i64
102 %gep2 = getelementptr i8, ptr addrspace(4) %gep1, i64 %zext
103 %ld = load i8, ptr addrspace(4) %gep2
104 %sext = sext i8 %ld to i32
105 store i32 %sext, ptr addrspace(1) %out
; Uniform i8 load + zext: selects the zero-extending scalar load s_load_u8.
109 define amdgpu_ps void @test_s_load_u8(ptr addrspace(4) inreg %in, ptr addrspace(1) %out) {
110 ; GCN-LABEL: test_s_load_u8:
112 ; GCN-NEXT: s_load_u8 s0, s[0:1], 0x0
113 ; GCN-NEXT: s_wait_kmcnt 0x0
114 ; GCN-NEXT: v_mov_b32_e32 v2, s0
115 ; GCN-NEXT: global_store_b32 v[0:1], v2, off
117 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
119 %ld = load i8, ptr addrspace(4) %in
120 %zext = zext i8 %ld to i32
121 store i32 %zext, ptr addrspace(1) %out
; Positive byte offset 255 encodes directly in the SMEM immediate (0xff) —
; contrast with the negative-offset case, which needs base adjustment.
125 define amdgpu_ps void @test_s_load_u8_imm(ptr addrspace(4) inreg %in, ptr addrspace(1) %out) {
126 ; GCN-LABEL: test_s_load_u8_imm:
128 ; GCN-NEXT: s_load_u8 s0, s[0:1], 0xff
129 ; GCN-NEXT: s_wait_kmcnt 0x0
130 ; GCN-NEXT: v_mov_b32_e32 v2, s0
131 ; GCN-NEXT: global_store_b32 v[0:1], v2, off
133 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
135 %gep = getelementptr i8, ptr addrspace(4) %in, i64 255
136 %ld = load i8, ptr addrspace(4) %gep
137 %zext = zext i8 %ld to i32
138 store i32 %zext, ptr addrspace(1) %out
; Uniform SGPR byte offset folds into the s_load_u8 soffset operand.
142 define amdgpu_ps void @test_s_load_u8_sgpr(ptr addrspace(4) inreg %in, i32 inreg %offset, ptr addrspace(1) %out) {
143 ; GCN-LABEL: test_s_load_u8_sgpr:
145 ; GCN-NEXT: s_load_u8 s0, s[0:1], s2 offset:0x0
146 ; GCN-NEXT: s_wait_kmcnt 0x0
147 ; GCN-NEXT: v_mov_b32_e32 v2, s0
148 ; GCN-NEXT: global_store_b32 v[0:1], v2, off
150 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
152 %zext1 = zext i32 %offset to i64
153 %gep = getelementptr i8, ptr addrspace(4) %in, i64 %zext1
154 %ld = load i8, ptr addrspace(4) %gep
155 %zext2 = zext i8 %ld to i32
156 store i32 %zext2, ptr addrspace(1) %out
; SGPR offset + constant 16: both fold into s_load_u8 (soffset + 0x10).
160 define amdgpu_ps void @test_s_load_u8_sgpr_imm(ptr addrspace(4) inreg %in, i32 inreg %offset, ptr addrspace(1) %out) {
161 ; GCN-LABEL: test_s_load_u8_sgpr_imm:
163 ; GCN-NEXT: s_load_u8 s0, s[0:1], s2 offset:0x10
164 ; GCN-NEXT: s_wait_kmcnt 0x0
165 ; GCN-NEXT: v_mov_b32_e32 v2, s0
166 ; GCN-NEXT: global_store_b32 v[0:1], v2, off
168 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
170 %gep1 = getelementptr i8, ptr addrspace(4) %in, i64 16
171 %zext1= zext i32 %offset to i64
172 %gep2 = getelementptr i8, ptr addrspace(4) %gep1, i64 %zext1
173 %ld = load i8, ptr addrspace(4) %gep2
174 %zext2= zext i8 %ld to i32
175 store i32 %zext2, ptr addrspace(1) %out
; Divergent offset: falls back to VMEM global_load_u8 with the folded offset:16.
179 define amdgpu_ps void @test_s_load_u8_divergent(ptr addrspace(4) inreg %in, i32 %offset, ptr addrspace(1) %out) {
180 ; GCN-LABEL: test_s_load_u8_divergent:
182 ; GCN-NEXT: global_load_u8 v0, v0, s[0:1] offset:16
183 ; GCN-NEXT: s_wait_loadcnt 0x0
184 ; GCN-NEXT: global_store_b32 v[1:2], v0, off
186 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
188 %gep1 = getelementptr i8, ptr addrspace(4) %in, i64 16
189 %zext1= zext i32 %offset to i64
190 %gep2 = getelementptr i8, ptr addrspace(4) %gep1, i64 %zext1
191 %ld = load i8, ptr addrspace(4) %gep2
192 %zext2= zext i8 %ld to i32
193 store i32 %zext2, ptr addrspace(1) %out
; Uniform i16 load + sext -> scalar s_load_i16.
197 define amdgpu_ps void @test_s_load_i16(ptr addrspace(4) inreg %in, ptr addrspace(1) %out) {
198 ; GCN-LABEL: test_s_load_i16:
200 ; GCN-NEXT: s_load_i16 s0, s[0:1], 0x0
201 ; GCN-NEXT: s_wait_kmcnt 0x0
202 ; GCN-NEXT: v_mov_b32_e32 v2, s0
203 ; GCN-NEXT: global_store_b32 v[0:1], v2, off
205 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
207 %ld = load i16, ptr addrspace(4) %in
208 %sext = sext i16 %ld to i32
209 store i32 %sext, ptr addrspace(1) %out
; GEP of -100 i16 elements = -200 bytes (0x...ff38): negative offset is applied
; to the base before the load. SDAG: 64-bit s_add_nc_u64; GlobalISel: 32-bit
; add/add-carry pair — hence split DAG/GISEL checks.
213 define amdgpu_ps void @test_s_load_i16_imm(ptr addrspace(4) inreg %in, ptr addrspace(1) %out) {
214 ; DAG-LABEL: test_s_load_i16_imm:
216 ; DAG-NEXT: s_movk_i32 s2, 0xff38
217 ; DAG-NEXT: s_mov_b32 s3, -1
218 ; DAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
219 ; DAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
220 ; DAG-NEXT: s_load_i16 s0, s[0:1], 0x0
221 ; DAG-NEXT: s_wait_kmcnt 0x0
222 ; DAG-NEXT: v_mov_b32_e32 v2, s0
223 ; DAG-NEXT: global_store_b32 v[0:1], v2, off
225 ; DAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
228 ; GISEL-LABEL: test_s_load_i16_imm:
230 ; GISEL-NEXT: s_add_co_u32 s0, s0, 0xffffff38
231 ; GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1
232 ; GISEL-NEXT: s_load_i16 s0, s[0:1], 0x0
233 ; GISEL-NEXT: s_wait_kmcnt 0x0
234 ; GISEL-NEXT: v_mov_b32_e32 v2, s0
235 ; GISEL-NEXT: global_store_b32 v[0:1], v2, off
236 ; GISEL-NEXT: s_nop 0
237 ; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
238 ; GISEL-NEXT: s_endpgm
239 %gep = getelementptr i16, ptr addrspace(4) %in, i64 -100
240 %ld = load i16, ptr addrspace(4) %gep
241 %sext = sext i16 %ld to i32
242 store i32 %sext, ptr addrspace(1) %out
; Uniform SGPR byte offset (i8 GEP, so no scaling needed) folds into the
; s_load_i16 soffset operand.
246 define amdgpu_ps void @test_s_load_i16_sgpr(ptr addrspace(4) inreg %in, i32 inreg %offset, ptr addrspace(1) %out) {
247 ; GCN-LABEL: test_s_load_i16_sgpr:
249 ; GCN-NEXT: s_load_i16 s0, s[0:1], s2 offset:0x0
250 ; GCN-NEXT: s_wait_kmcnt 0x0
251 ; GCN-NEXT: v_mov_b32_e32 v2, s0
252 ; GCN-NEXT: global_store_b32 v[0:1], v2, off
254 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
256 %zext = zext i32 %offset to i64
257 %gep = getelementptr i8, ptr addrspace(4) %in, i64 %zext
258 %ld = load i16, ptr addrspace(4) %gep
259 %sext = sext i16 %ld to i32
260 store i32 %sext, ptr addrspace(1) %out
; i16-element SGPR offset must be scaled by 2 (s_lshl_b64 ... 1) and added to
; the base in SALU code; only the constant part (16 i16 = 0x20 bytes) stays in
; the SMEM immediate. DAG and GISEL differ only in the 64-bit add sequence.
264 define amdgpu_ps void @test_s_load_i16_sgpr_imm(ptr addrspace(4) inreg %in, i32 inreg %offset, ptr addrspace(1) %out) {
265 ; DAG-LABEL: test_s_load_i16_sgpr_imm:
267 ; DAG-NEXT: s_mov_b32 s3, 0
268 ; DAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
269 ; DAG-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
270 ; DAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
271 ; DAG-NEXT: s_load_i16 s0, s[0:1], 0x20
272 ; DAG-NEXT: s_wait_kmcnt 0x0
273 ; DAG-NEXT: v_mov_b32_e32 v2, s0
274 ; DAG-NEXT: global_store_b32 v[0:1], v2, off
276 ; DAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
279 ; GISEL-LABEL: test_s_load_i16_sgpr_imm:
281 ; GISEL-NEXT: s_mov_b32 s3, 0
282 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
283 ; GISEL-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
284 ; GISEL-NEXT: s_add_co_u32 s0, s0, s2
285 ; GISEL-NEXT: s_add_co_ci_u32 s1, s1, s3
286 ; GISEL-NEXT: s_load_i16 s0, s[0:1], 0x20
287 ; GISEL-NEXT: s_wait_kmcnt 0x0
288 ; GISEL-NEXT: v_mov_b32_e32 v2, s0
289 ; GISEL-NEXT: global_store_b32 v[0:1], v2, off
290 ; GISEL-NEXT: s_nop 0
291 ; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
292 ; GISEL-NEXT: s_endpgm
293 %gep1 = getelementptr i16, ptr addrspace(4) %in, i64 16
294 %zext = zext i32 %offset to i64
295 %gep2 = getelementptr i16, ptr addrspace(4) %gep1, i64 %zext
296 %ld = load i16, ptr addrspace(4) %gep2
297 %sext = sext i16 %ld to i32
298 store i32 %sext, ptr addrspace(1) %out
; Divergent i16-element offset: the address is computed in VGPRs (shift by 1,
; 64-bit add with carry) and the load becomes VMEM global_load_i16 offset:32.
; DAG and GISEL differ only in register shuffling around the same computation.
302 define amdgpu_ps void @test_s_load_i16_divergent(ptr addrspace(4) inreg %in, i32 %offset, ptr addrspace(1) %out) {
303 ; DAG-LABEL: test_s_load_i16_divergent:
305 ; DAG-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, 0
306 ; DAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
307 ; DAG-NEXT: v_lshlrev_b64_e32 v[3:4], 1, v[3:4]
308 ; DAG-NEXT: v_add_co_u32 v3, vcc_lo, s0, v3
309 ; DAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
310 ; DAG-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s1, v4, vcc_lo
311 ; DAG-NEXT: global_load_i16 v0, v[3:4], off offset:32
312 ; DAG-NEXT: s_wait_loadcnt 0x0
313 ; DAG-NEXT: global_store_b32 v[1:2], v0, off
315 ; DAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
318 ; GISEL-LABEL: test_s_load_i16_divergent:
320 ; GISEL-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
321 ; GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v6, s1
322 ; GISEL-NEXT: v_mov_b32_e32 v5, s0
323 ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
324 ; GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 1, v[0:1]
325 ; GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v5, v0
326 ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
327 ; GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v6, v1, vcc_lo
328 ; GISEL-NEXT: global_load_i16 v0, v[0:1], off offset:32
329 ; GISEL-NEXT: s_wait_loadcnt 0x0
330 ; GISEL-NEXT: global_store_b32 v[3:4], v0, off
331 ; GISEL-NEXT: s_nop 0
332 ; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
333 ; GISEL-NEXT: s_endpgm
334 %gep1 = getelementptr i16, ptr addrspace(4) %in, i64 16
335 %zext = zext i32 %offset to i64
336 %gep2 = getelementptr i16, ptr addrspace(4) %gep1, i64 %zext
337 %ld = load i16, ptr addrspace(4) %gep2
338 %sext = sext i16 %ld to i32
339 store i32 %sext, ptr addrspace(1) %out
; Uniform i16 load + zext -> zero-extending scalar s_load_u16.
343 define amdgpu_ps void @test_s_load_u16(ptr addrspace(4) inreg %in, ptr addrspace(1) %out) {
344 ; GCN-LABEL: test_s_load_u16:
346 ; GCN-NEXT: s_load_u16 s0, s[0:1], 0x0
347 ; GCN-NEXT: s_wait_kmcnt 0x0
348 ; GCN-NEXT: v_mov_b32_e32 v2, s0
349 ; GCN-NEXT: global_store_b32 v[0:1], v2, off
351 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
353 %ld = load i16, ptr addrspace(4) %in
354 %zext = zext i16 %ld to i32
355 store i32 %zext, ptr addrspace(1) %out
; 255 i16 elements = 510 bytes: encodes directly as the 0x1fe SMEM immediate.
359 define amdgpu_ps void @test_s_load_u16_imm(ptr addrspace(4) inreg %in, ptr addrspace(1) %out) {
360 ; GCN-LABEL: test_s_load_u16_imm:
362 ; GCN-NEXT: s_load_u16 s0, s[0:1], 0x1fe
363 ; GCN-NEXT: s_wait_kmcnt 0x0
364 ; GCN-NEXT: v_mov_b32_e32 v2, s0
365 ; GCN-NEXT: global_store_b32 v[0:1], v2, off
367 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
369 %gep = getelementptr i16, ptr addrspace(4) %in, i64 255
370 %ld = load i16, ptr addrspace(4) %gep
371 %zext = zext i16 %ld to i32
372 store i32 %zext, ptr addrspace(1) %out
; Uniform SGPR byte offset (i8 GEP) folds into the s_load_u16 soffset operand.
376 define amdgpu_ps void @test_s_load_u16_sgpr(ptr addrspace(4) inreg %in, i32 inreg %offset, ptr addrspace(1) %out) {
377 ; GCN-LABEL: test_s_load_u16_sgpr:
379 ; GCN-NEXT: s_load_u16 s0, s[0:1], s2 offset:0x0
380 ; GCN-NEXT: s_wait_kmcnt 0x0
381 ; GCN-NEXT: v_mov_b32_e32 v2, s0
382 ; GCN-NEXT: global_store_b32 v[0:1], v2, off
384 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
386 %zext1 = zext i32 %offset to i64
387 %gep = getelementptr i8, ptr addrspace(4) %in, i64 %zext1
388 %ld = load i16, ptr addrspace(4) %gep
389 %zext2 = zext i16 %ld to i32
390 store i32 %zext2, ptr addrspace(1) %out
; i16-element SGPR offset: scaled by 2 and added to the base in SALU code; the
; constant 16 elements (0x20 bytes) remain in the SMEM immediate. DAG vs GISEL
; differ only in the 64-bit add sequence.
394 define amdgpu_ps void @test_s_load_u16_sgpr_imm(ptr addrspace(4) inreg %in, i32 inreg %offset, ptr addrspace(1) %out) {
395 ; DAG-LABEL: test_s_load_u16_sgpr_imm:
397 ; DAG-NEXT: s_mov_b32 s3, 0
398 ; DAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
399 ; DAG-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
400 ; DAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
401 ; DAG-NEXT: s_load_u16 s0, s[0:1], 0x20
402 ; DAG-NEXT: s_wait_kmcnt 0x0
403 ; DAG-NEXT: v_mov_b32_e32 v2, s0
404 ; DAG-NEXT: global_store_b32 v[0:1], v2, off
406 ; DAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
409 ; GISEL-LABEL: test_s_load_u16_sgpr_imm:
411 ; GISEL-NEXT: s_mov_b32 s3, 0
412 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
413 ; GISEL-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
414 ; GISEL-NEXT: s_add_co_u32 s0, s0, s2
415 ; GISEL-NEXT: s_add_co_ci_u32 s1, s1, s3
416 ; GISEL-NEXT: s_load_u16 s0, s[0:1], 0x20
417 ; GISEL-NEXT: s_wait_kmcnt 0x0
418 ; GISEL-NEXT: v_mov_b32_e32 v2, s0
419 ; GISEL-NEXT: global_store_b32 v[0:1], v2, off
420 ; GISEL-NEXT: s_nop 0
421 ; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
422 ; GISEL-NEXT: s_endpgm
423 %gep1 = getelementptr i16, ptr addrspace(4) %in, i64 16
424 %zext1= zext i32 %offset to i64
425 %gep2 = getelementptr i16, ptr addrspace(4) %gep1, i64 %zext1
426 %ld = load i16, ptr addrspace(4) %gep2
427 %zext2= zext i16 %ld to i32
428 store i32 %zext2, ptr addrspace(1) %out
; Divergent i16-element offset: VGPR address computation (shift by 1, 64-bit
; add with carry) followed by VMEM global_load_u16 offset:32.
432 define amdgpu_ps void @test_s_load_u16_divergent(ptr addrspace(4) inreg %in, i32 %offset, ptr addrspace(1) %out) {
433 ; DAG-LABEL: test_s_load_u16_divergent:
435 ; DAG-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, 0
436 ; DAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
437 ; DAG-NEXT: v_lshlrev_b64_e32 v[3:4], 1, v[3:4]
438 ; DAG-NEXT: v_add_co_u32 v3, vcc_lo, s0, v3
439 ; DAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
440 ; DAG-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s1, v4, vcc_lo
441 ; DAG-NEXT: global_load_u16 v0, v[3:4], off offset:32
442 ; DAG-NEXT: s_wait_loadcnt 0x0
443 ; DAG-NEXT: global_store_b32 v[1:2], v0, off
445 ; DAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
448 ; GISEL-LABEL: test_s_load_u16_divergent:
450 ; GISEL-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
451 ; GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v6, s1
452 ; GISEL-NEXT: v_mov_b32_e32 v5, s0
453 ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
454 ; GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 1, v[0:1]
455 ; GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v5, v0
456 ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
457 ; GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v6, v1, vcc_lo
458 ; GISEL-NEXT: global_load_u16 v0, v[0:1], off offset:32
459 ; GISEL-NEXT: s_wait_loadcnt 0x0
460 ; GISEL-NEXT: global_store_b32 v[3:4], v0, off
461 ; GISEL-NEXT: s_nop 0
462 ; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
463 ; GISEL-NEXT: s_endpgm
464 %gep1 = getelementptr i16, ptr addrspace(4) %in, i64 16
465 %zext1= zext i32 %offset to i64
466 %gep2 = getelementptr i16, ptr addrspace(4) %gep1, i64 %zext1
467 %ld = load i16, ptr addrspace(4) %gep2
468 %zext2= zext i16 %ld to i32
469 store i32 %zext2, ptr addrspace(1) %out
; llvm.amdgcn.s.buffer.load.i8 with constant offset 4 -> s_buffer_load_i8 with
; the 0x4 immediate; the sext result is used directly (no extra masking).
473 define amdgpu_ps void @s_buffer_load_byte_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out) {
474 ; GCN-LABEL: s_buffer_load_byte_imm_offset:
475 ; GCN: ; %bb.0: ; %main_body
476 ; GCN-NEXT: s_buffer_load_i8 s0, s[0:3], 0x4
477 ; GCN-NEXT: s_wait_kmcnt 0x0
478 ; GCN-NEXT: v_mov_b32_e32 v2, s0
479 ; GCN-NEXT: global_store_b32 v[0:1], v2, off
481 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
484 %ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %src, i32 4, i32 0)
485 %sext = sext i8 %ld to i32
486 store i32 %sext, ptr addrspace(1) %out
; Uniform SGPR offset folds into the s_buffer_load_i8 soffset operand.
490 define amdgpu_ps void @s_buffer_load_byte_sgpr(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 inreg %offset) {
491 ; GCN-LABEL: s_buffer_load_byte_sgpr:
492 ; GCN: ; %bb.0: ; %main_body
493 ; GCN-NEXT: s_buffer_load_i8 s0, s[0:3], s4 offset:0x0
494 ; GCN-NEXT: s_wait_kmcnt 0x0
495 ; GCN-NEXT: v_mov_b32_e32 v2, s0
496 ; GCN-NEXT: global_store_b32 v[0:1], v2, off
498 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
501 %ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %src, i32 %offset, i32 0)
502 %sext = sext i8 %ld to i32
503 store i32 %sext, ptr addrspace(1) %out
; add nuw nsw %in, 100 splits into soffset (s4) plus the 0x64 immediate of a
; single s_buffer_load_i8.
507 define amdgpu_ps void @s_buffer_load_byte_sgpr_or_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 inreg %in) {
508 ; GCN-LABEL: s_buffer_load_byte_sgpr_or_imm_offset:
509 ; GCN: ; %bb.0: ; %main_body
510 ; GCN-NEXT: s_buffer_load_i8 s0, s[0:3], s4 offset:0x64
511 ; GCN-NEXT: s_wait_kmcnt 0x0
512 ; GCN-NEXT: v_mov_b32_e32 v2, s0
513 ; GCN-NEXT: global_store_b32 v[0:1], v2, off
515 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
518 %off = add nuw nsw i32 %in, 100
519 %ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %src, i32 %off, i32 0)
520 %sext = sext i8 %ld to i32
521 store i32 %sext, ptr addrspace(1) %out
; Divergent offset forces MUBUF: SDAG selects the sub-dword buffer_load_i8;
; GlobalISel selects a full-dword buffer_load_b32 with no visible sign-extend.
; NOTE(review): the GISEL sequence stores the raw dword, apparently relying on
; the high bytes being ignored/zero — confirm this divergence is intentional.
525 define amdgpu_ps void @s_buffer_load_byte_sgpr_or_imm_offset_divergent(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 %offset) {
526 ; DAG-LABEL: s_buffer_load_byte_sgpr_or_imm_offset_divergent:
527 ; DAG: ; %bb.0: ; %main_body
528 ; DAG-NEXT: buffer_load_i8 v2, v2, s[0:3], null offen
529 ; DAG-NEXT: s_wait_loadcnt 0x0
530 ; DAG-NEXT: global_store_b32 v[0:1], v2, off
532 ; DAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
535 ; GISEL-LABEL: s_buffer_load_byte_sgpr_or_imm_offset_divergent:
536 ; GISEL: ; %bb.0: ; %main_body
537 ; GISEL-NEXT: buffer_load_b32 v2, v2, s[0:3], null offen
538 ; GISEL-NEXT: s_wait_loadcnt 0x0
539 ; GISEL-NEXT: global_store_b32 v[0:1], v2, off
540 ; GISEL-NEXT: s_nop 0
541 ; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
542 ; GISEL-NEXT: s_endpgm
544 %ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %src, i32 %offset, i32 0)
545 %sext = sext i8 %ld to i32
546 store i32 %sext, ptr addrspace(1) %out
; Unsigned variant: s_buffer_load_u8 followed by an explicit s_and_b32 0xff to
; materialize the zext i8 -> i32 result.
550 define amdgpu_ps void @s_buffer_load_ubyte_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out) {
551 ; GCN-LABEL: s_buffer_load_ubyte_imm_offset:
552 ; GCN: ; %bb.0: ; %main_body
553 ; GCN-NEXT: s_buffer_load_u8 s0, s[0:3], 0x4
554 ; GCN-NEXT: s_wait_kmcnt 0x0
555 ; GCN-NEXT: s_and_b32 s0, s0, 0xff
556 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
557 ; GCN-NEXT: v_mov_b32_e32 v2, s0
558 ; GCN-NEXT: global_store_b32 v[0:1], v2, off
560 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
563 %ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 4, i32 0)
564 %zext = zext i8 %ld to i32
565 store i32 %zext, ptr addrspace(1) %out
; SGPR offset in soffset; zext realized by the trailing s_and_b32 0xff.
569 define amdgpu_ps void @s_buffer_load_ubyte_sgpr(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 inreg %offset) {
570 ; GCN-LABEL: s_buffer_load_ubyte_sgpr:
571 ; GCN: ; %bb.0: ; %main_body
572 ; GCN-NEXT: s_buffer_load_u8 s0, s[0:3], s4 offset:0x0
573 ; GCN-NEXT: s_wait_kmcnt 0x0
574 ; GCN-NEXT: s_and_b32 s0, s0, 0xff
575 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
576 ; GCN-NEXT: v_mov_b32_e32 v2, s0
577 ; GCN-NEXT: global_store_b32 v[0:1], v2, off
579 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
582 %ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 %offset, i32 0)
583 %zext = zext i8 %ld to i32
584 store i32 %zext, ptr addrspace(1) %out
; SGPR offset + constant 100 fold into soffset + 0x64; zext via s_and_b32 0xff.
588 define amdgpu_ps void @s_buffer_load_ubyte_sgpr_or_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 inreg %in) {
589 ; GCN-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset:
590 ; GCN: ; %bb.0: ; %main_body
591 ; GCN-NEXT: s_buffer_load_u8 s0, s[0:3], s4 offset:0x64
592 ; GCN-NEXT: s_wait_kmcnt 0x0
593 ; GCN-NEXT: s_and_b32 s0, s0, 0xff
594 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
595 ; GCN-NEXT: v_mov_b32_e32 v2, s0
596 ; GCN-NEXT: global_store_b32 v[0:1], v2, off
598 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
601 %off = add nuw nsw i32 %in, 100
602 %ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 %off, i32 0)
603 %zext = zext i8 %ld to i32
604 store i32 %zext, ptr addrspace(1) %out
; Divergent offset, unsigned: SDAG selects buffer_load_u8 (zero-extending);
; GlobalISel loads a full dword and masks with v_and_b32 0xff — functionally
; the zext is preserved, just realized differently per selector.
608 define amdgpu_ps void @s_buffer_load_ubyte_sgpr_or_imm_offset_divergent(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 %offset) {
609 ; DAG-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset_divergent:
610 ; DAG: ; %bb.0: ; %main_body
611 ; DAG-NEXT: buffer_load_u8 v2, v2, s[0:3], null offen
612 ; DAG-NEXT: s_wait_loadcnt 0x0
613 ; DAG-NEXT: global_store_b32 v[0:1], v2, off
615 ; DAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
618 ; GISEL-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset_divergent:
619 ; GISEL: ; %bb.0: ; %main_body
620 ; GISEL-NEXT: buffer_load_b32 v2, v2, s[0:3], null offen
621 ; GISEL-NEXT: s_wait_loadcnt 0x0
622 ; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2
623 ; GISEL-NEXT: global_store_b32 v[0:1], v2, off
624 ; GISEL-NEXT: s_nop 0
625 ; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
626 ; GISEL-NEXT: s_endpgm
628 %ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 %offset, i32 0)
629 %zext = zext i8 %ld to i32
630 store i32 %zext, ptr addrspace(1) %out
; Signed i16 buffer load with constant offset -> s_buffer_load_i16 0x4; result
; used directly (SMEM i16 variant sign-extends, no extra ALU op emitted).
634 define amdgpu_ps void @s_buffer_load_short_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out) {
635 ; GCN-LABEL: s_buffer_load_short_imm_offset:
636 ; GCN: ; %bb.0: ; %main_body
637 ; GCN-NEXT: s_buffer_load_i16 s0, s[0:3], 0x4
638 ; GCN-NEXT: s_wait_kmcnt 0x0
639 ; GCN-NEXT: v_mov_b32_e32 v2, s0
640 ; GCN-NEXT: global_store_b32 v[0:1], v2, off
642 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
645 %ld = call i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32> %src, i32 4, i32 0)
646 %sext = sext i16 %ld to i32
647 store i32 %sext, ptr addrspace(1) %out
; Uniform SGPR offset folds into the s_buffer_load_i16 soffset operand.
651 define amdgpu_ps void @s_buffer_load_short_sgpr(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 inreg %offset) {
652 ; GCN-LABEL: s_buffer_load_short_sgpr:
653 ; GCN: ; %bb.0: ; %main_body
654 ; GCN-NEXT: s_buffer_load_i16 s0, s[0:3], s4 offset:0x0
655 ; GCN-NEXT: s_wait_kmcnt 0x0
656 ; GCN-NEXT: v_mov_b32_e32 v2, s0
657 ; GCN-NEXT: global_store_b32 v[0:1], v2, off
659 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
662 %ld = call i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32> %src, i32 %offset, i32 0)
663 %sext = sext i16 %ld to i32
664 store i32 %sext, ptr addrspace(1) %out
; SGPR offset + constant 100 fold into soffset + the 0x64 immediate.
668 define amdgpu_ps void @s_buffer_load_short_sgpr_or_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 inreg %in) {
669 ; GCN-LABEL: s_buffer_load_short_sgpr_or_imm_offset:
670 ; GCN: ; %bb.0: ; %main_body
671 ; GCN-NEXT: s_buffer_load_i16 s0, s[0:3], s4 offset:0x64
672 ; GCN-NEXT: s_wait_kmcnt 0x0
673 ; GCN-NEXT: v_mov_b32_e32 v2, s0
674 ; GCN-NEXT: global_store_b32 v[0:1], v2, off
676 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
679 %off = add nuw nsw i32 %in, 100
680 %ld = call i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32> %src, i32 %off, i32 0)
681 %sext = sext i16 %ld to i32
682 store i32 %sext, ptr addrspace(1) %out
; Divergent offset, signed i16: SDAG uses the sign-extending buffer_load_i16;
; GlobalISel loads a full dword and stores it as-is.
; NOTE(review): as with the byte case, the GISEL path performs no visible
; sign-extend — confirm this selector divergence is the intended behavior.
686 define amdgpu_ps void @s_buffer_load_short_sgpr_or_imm_offset_divergent(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 %offset) {
687 ; DAG-LABEL: s_buffer_load_short_sgpr_or_imm_offset_divergent:
688 ; DAG: ; %bb.0: ; %main_body
689 ; DAG-NEXT: buffer_load_i16 v2, v2, s[0:3], null offen
690 ; DAG-NEXT: s_wait_loadcnt 0x0
691 ; DAG-NEXT: global_store_b32 v[0:1], v2, off
693 ; DAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
696 ; GISEL-LABEL: s_buffer_load_short_sgpr_or_imm_offset_divergent:
697 ; GISEL: ; %bb.0: ; %main_body
698 ; GISEL-NEXT: buffer_load_b32 v2, v2, s[0:3], null offen
699 ; GISEL-NEXT: s_wait_loadcnt 0x0
700 ; GISEL-NEXT: global_store_b32 v[0:1], v2, off
701 ; GISEL-NEXT: s_nop 0
702 ; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
703 ; GISEL-NEXT: s_endpgm
705 %ld = call i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32> %src, i32 %offset, i32 0)
706 %sext = sext i16 %ld to i32
707 store i32 %sext, ptr addrspace(1) %out
; Unsigned i16 variant: s_buffer_load_u16 plus an explicit s_and_b32 0xffff to
; realize the zext i16 -> i32.
711 define amdgpu_ps void @s_buffer_load_ushort_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out) {
712 ; GCN-LABEL: s_buffer_load_ushort_imm_offset:
713 ; GCN: ; %bb.0: ; %main_body
714 ; GCN-NEXT: s_buffer_load_u16 s0, s[0:3], 0x4
715 ; GCN-NEXT: s_wait_kmcnt 0x0
716 ; GCN-NEXT: s_and_b32 s0, s0, 0xffff
717 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
718 ; GCN-NEXT: v_mov_b32_e32 v2, s0
719 ; GCN-NEXT: global_store_b32 v[0:1], v2, off
721 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
724 %ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 4, i32 0)
725 %zext = zext i16 %ld to i32
726 store i32 %zext, ptr addrspace(1) %out
; SGPR offset in soffset; zext realized by the trailing s_and_b32 0xffff.
730 define amdgpu_ps void @s_buffer_load_ushort_sgpr(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 inreg %offset) {
731 ; GCN-LABEL: s_buffer_load_ushort_sgpr:
732 ; GCN: ; %bb.0: ; %main_body
733 ; GCN-NEXT: s_buffer_load_u16 s0, s[0:3], s4 offset:0x0
734 ; GCN-NEXT: s_wait_kmcnt 0x0
735 ; GCN-NEXT: s_and_b32 s0, s0, 0xffff
736 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
737 ; GCN-NEXT: v_mov_b32_e32 v2, s0
738 ; GCN-NEXT: global_store_b32 v[0:1], v2, off
740 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
743 %ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 %offset, i32 0)
744 %zext = zext i16 %ld to i32
745 store i32 %zext, ptr addrspace(1) %out
; SGPR offset + constant 100 fold into soffset + 0x64; zext via s_and_b32 0xffff.
749 define amdgpu_ps void @s_buffer_load_ushort_sgpr_or_imm_offset(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 inreg %in) {
750 ; GCN-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset:
751 ; GCN: ; %bb.0: ; %main_body
752 ; GCN-NEXT: s_buffer_load_u16 s0, s[0:3], s4 offset:0x64
753 ; GCN-NEXT: s_wait_kmcnt 0x0
754 ; GCN-NEXT: s_and_b32 s0, s0, 0xffff
755 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
756 ; GCN-NEXT: v_mov_b32_e32 v2, s0
757 ; GCN-NEXT: global_store_b32 v[0:1], v2, off
759 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
762 %off = add nuw nsw i32 %in, 100
763 %ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 %off, i32 0)
764 %zext = zext i16 %ld to i32
765 store i32 %zext, ptr addrspace(1) %out
; Divergent offset, unsigned i16: SDAG selects buffer_load_u16; GlobalISel
; loads a full dword and masks with v_and_b32 0xffff — zext preserved either way.
769 define amdgpu_ps void @s_buffer_load_ushort_sgpr_or_imm_offset_divergent(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 %offset) {
770 ; DAG-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset_divergent:
771 ; DAG: ; %bb.0: ; %main_body
772 ; DAG-NEXT: buffer_load_u16 v2, v2, s[0:3], null offen
773 ; DAG-NEXT: s_wait_loadcnt 0x0
774 ; DAG-NEXT: global_store_b32 v[0:1], v2, off
776 ; DAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
779 ; GISEL-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset_divergent:
780 ; GISEL: ; %bb.0: ; %main_body
781 ; GISEL-NEXT: buffer_load_b32 v2, v2, s[0:3], null offen
782 ; GISEL-NEXT: s_wait_loadcnt 0x0
783 ; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
784 ; GISEL-NEXT: global_store_b32 v[0:1], v2, off
785 ; GISEL-NEXT: s_nop 0
786 ; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
787 ; GISEL-NEXT: s_endpgm
789 %ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 %offset, i32 0)
790 %zext = zext i16 %ld to i32
791 store i32 %zext, ptr addrspace(1) %out
795 declare i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32>, i32, i32)
796 declare i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32>, i32, i32)
797 declare i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32>, i32, i32)
798 declare i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32>, i32, i32)