1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
4 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-GISEL %s
6 define amdgpu_cs void @mixed_vmem_types(i32 inreg %globalTable, i32 inreg %perShaderTable, i32 inreg %descTable0, i32 inreg %descTable1, <3 x i32> inreg %WorkgroupId, i32 inreg %MultiDispatchInfo, <3 x i32> %LocalInvocationId) #0 {
7 ; GFX11-LABEL: mixed_vmem_types:
8 ; GFX11: ; %bb.0: ; %.entry
9 ; GFX11-NEXT: s_getpc_b64 s[4:5]
10 ; GFX11-NEXT: s_mov_b32 s0, s3
11 ; GFX11-NEXT: s_mov_b32 s3, s5
12 ; GFX11-NEXT: s_mov_b32 s1, s5
13 ; GFX11-NEXT: s_load_b256 s[20:27], s[2:3], 0x40
14 ; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x0
15 ; GFX11-NEXT: s_load_b512 s[36:51], s[2:3], 0x0
16 ; GFX11-NEXT: v_mov_b32_e32 v0, 0xbc00bc00
17 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
18 ; GFX11-NEXT: buffer_load_b32 v1, off, s[20:23], 0
19 ; GFX11-NEXT: buffer_load_b32 v2, off, s[16:19], 0
20 ; GFX11-NEXT: image_sample_lz v3, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
21 ; GFX11-NEXT: buffer_load_b32 v4, off, s[40:43], 0
22 ; GFX11-NEXT: image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
23 ; GFX11-NEXT: s_waitcnt vmcnt(4)
24 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0xac0, v1
25 ; GFX11-NEXT: s_waitcnt vmcnt(3)
26 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0xac0, v2
27 ; GFX11-NEXT: s_waitcnt vmcnt(2)
28 ; GFX11-NEXT: v_cmp_eq_f32_e64 s1, 1.0, v3
29 ; GFX11-NEXT: s_waitcnt vmcnt(1)
30 ; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0xac0, v4
31 ; GFX11-NEXT: s_and_b32 s0, s0, vcc_lo
32 ; GFX11-NEXT: s_waitcnt vmcnt(0)
33 ; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0
34 ; GFX11-NEXT: s_and_b32 s0, s0, s1
35 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
36 ; GFX11-NEXT: s_and_b32 s0, s0, s2
37 ; GFX11-NEXT: s_and_b32 s0, s0, vcc_lo
38 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
39 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
40 ; GFX11-NEXT: buffer_store_b32 v0, off, s[24:27], 0
41 ; GFX11-NEXT: s_endpgm
43 ; GFX12-LABEL: mixed_vmem_types:
44 ; GFX12: ; %bb.0: ; %.entry
45 ; GFX12-NEXT: s_getpc_b64 s[4:5]
46 ; GFX12-NEXT: s_mov_b32 s0, s3
47 ; GFX12-NEXT: s_sext_i32_i16 s5, s5
48 ; GFX12-NEXT: v_mov_b32_e32 v0, 0xbc00bc00
49 ; GFX12-NEXT: s_mov_b32 s3, s5
50 ; GFX12-NEXT: s_mov_b32 s1, s5
51 ; GFX12-NEXT: s_load_b256 s[20:27], s[2:3], 0x40
52 ; GFX12-NEXT: s_load_b512 s[4:19], s[0:1], 0x0
53 ; GFX12-NEXT: s_load_b512 s[36:51], s[2:3], 0x0
54 ; GFX12-NEXT: s_wait_kmcnt 0x0
55 ; GFX12-NEXT: buffer_load_b32 v1, off, s[20:23], null
56 ; GFX12-NEXT: buffer_load_b32 v2, off, s[16:19], null
57 ; GFX12-NEXT: image_sample_lz v3, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
58 ; GFX12-NEXT: buffer_load_b32 v4, off, s[40:43], null
59 ; GFX12-NEXT: image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
60 ; GFX12-NEXT: s_wait_loadcnt 0x2
61 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0xac0, v1
62 ; GFX12-NEXT: s_wait_loadcnt 0x1
63 ; GFX12-NEXT: v_cmp_eq_u32_e64 s0, 0xac0, v2
64 ; GFX12-NEXT: s_wait_samplecnt 0x1
65 ; GFX12-NEXT: v_cmp_eq_f32_e64 s1, 1.0, v3
66 ; GFX12-NEXT: s_wait_loadcnt 0x0
67 ; GFX12-NEXT: v_cmp_eq_u32_e64 s2, 0xac0, v4
68 ; GFX12-NEXT: s_and_b32 s0, s0, vcc_lo
69 ; GFX12-NEXT: s_wait_samplecnt 0x0
70 ; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0
71 ; GFX12-NEXT: s_and_b32 s0, s0, s1
72 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
73 ; GFX12-NEXT: s_and_b32 s0, s0, s2
74 ; GFX12-NEXT: s_and_b32 s0, s0, vcc_lo
75 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
76 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
77 ; GFX12-NEXT: buffer_store_b32 v0, off, s[24:27], null
78 ; GFX12-NEXT: s_endpgm
80 ; GFX12-GISEL-LABEL: mixed_vmem_types:
81 ; GFX12-GISEL: ; %bb.0: ; %.entry
82 ; GFX12-GISEL-NEXT: s_getpc_b64 s[20:21]
83 ; GFX12-GISEL-NEXT: s_mov_b32 s0, s3
84 ; GFX12-GISEL-NEXT: s_sext_i32_i16 s21, s21
85 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0xbc00bc00
86 ; GFX12-GISEL-NEXT: s_mov_b32 s1, s21
87 ; GFX12-GISEL-NEXT: s_mov_b32 s3, s21
88 ; GFX12-GISEL-NEXT: s_load_b512 s[4:19], s[0:1], 0x0
89 ; GFX12-GISEL-NEXT: s_clause 0x1
90 ; GFX12-GISEL-NEXT: s_load_b256 s[20:27], s[2:3], 0x40
91 ; GFX12-GISEL-NEXT: s_load_b512 s[36:51], s[2:3], 0x0
92 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
93 ; GFX12-GISEL-NEXT: image_sample_lz v1, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
94 ; GFX12-GISEL-NEXT: buffer_load_b32 v2, off, s[16:19], null
95 ; GFX12-GISEL-NEXT: buffer_load_b32 v3, off, s[20:23], null
96 ; GFX12-GISEL-NEXT: buffer_load_b32 v4, off, s[40:43], null
97 ; GFX12-GISEL-NEXT: image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
98 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x2
99 ; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s0, 0xac0, v2
100 ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x1
101 ; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 1.0, v1
102 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x1
103 ; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s1, 0xac0, v3
104 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
105 ; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s2, 0xac0, v4
106 ; GFX12-GISEL-NEXT: s_and_b32 s0, s0, vcc_lo
107 ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
108 ; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0
109 ; GFX12-GISEL-NEXT: s_and_b32 s0, s0, s1
110 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
111 ; GFX12-GISEL-NEXT: s_and_b32 s0, s0, s2
112 ; GFX12-GISEL-NEXT: s_and_b32 s0, s0, vcc_lo
113 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
114 ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
115 ; GFX12-GISEL-NEXT: buffer_store_b32 v0, off, s[24:27], null
116 ; GFX12-GISEL-NEXT: s_endpgm
118 %i = call i64 @llvm.amdgcn.s.getpc()
119 %extelt.offset = lshr i64 %i, 32
120 %.i1 = trunc i64 %extelt.offset to i32
121 %.upto0 = insertelement <2 x i32> poison, i32 %descTable1, i64 0
122 %i1 = insertelement <2 x i32> %.upto0, i32 %.i1, i64 1
123 %i2 = bitcast <2 x i32> %i1 to i64
124 %i3 = inttoptr i64 %i2 to ptr addrspace(4)
125 %.upto03 = insertelement <2 x i32> poison, i32 %descTable0, i64 0
126 %i4 = insertelement <2 x i32> %.upto03, i32 %.i1, i64 1
127 %i5 = bitcast <2 x i32> %i4 to i64
128 %i6 = inttoptr i64 %i5 to ptr addrspace(4)
129 %i7 = getelementptr i8, ptr addrspace(4) %i6, i64 80
130 %i8 = load <4 x i32>, ptr addrspace(4) %i7, align 16
131 %i9 = getelementptr i8, ptr addrspace(4) %i3, i64 48
132 %i10 = load <4 x i32>, ptr addrspace(4) %i9, align 16
133 %i11 = getelementptr i8, ptr addrspace(4) %i6, i64 64
134 %i12 = load <4 x i32>, ptr addrspace(4) %i11, align 16
135 %i13 = getelementptr i8, ptr addrspace(4) %i6, i64 16
136 %i14 = load <4 x i32>, ptr addrspace(4) %i13, align 16
137 %i15 = getelementptr i8, ptr addrspace(4) %i6, i64 32
138 %i16 = load <8 x i32>, ptr addrspace(4) %i15, align 32
139 %i17 = load <4 x i32>, ptr addrspace(4) %i6, align 16
140 %i18 = call float @llvm.amdgcn.image.sample.lz.2d.f32.f16.v8i32.v4i32(i32 1, half 0xHBC00, half 0xHBC00, <8 x i32> %i16, <4 x i32> %i17, i1 false, i32 0, i32 0)
141 %i19 = fcmp oeq float %i18, 0.000000e+00
142 %i20 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %i14, i32 0, i32 0, i32 0)
143 %.not = icmp eq i32 %i20, 2752
144 %i21 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %i12, i32 0, i32 0, i32 0)
145 %.not1 = icmp eq i32 %i21, 2752
146 %i22 = getelementptr i8, ptr addrspace(4) %i3, i64 16
147 %i23 = load <8 x i32>, ptr addrspace(4) %i22, align 32
148 %i24 = load <4 x i32>, ptr addrspace(4) %i3, align 16
149 %i25 = call float @llvm.amdgcn.image.sample.lz.2d.f32.f16.v8i32.v4i32(i32 1, half 0xHBC00, half 0xHBC00, <8 x i32> %i23, <4 x i32> %i24, i1 false, i32 0, i32 0)
150 %i26 = fcmp oeq float %i25, 1.000000e+00
151 %i27 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %i10, i32 0, i32 0, i32 0)
152 %.not2 = icmp eq i32 %i27, 2752
153 %i28 = select i1 %.not2, i1 %i26, i1 false
154 %i29 = select i1 %i28, i1 %.not1, i1 false
155 %i30 = select i1 %i29, i1 %.not, i1 false
156 %narrow2 = select i1 %i30, i1 %i19, i1 false
157 %.4 = zext i1 %narrow2 to i32
158 call void @llvm.amdgcn.raw.buffer.store.i32(i32 %.4, <4 x i32> %i8, i32 0, i32 0, i32 0)