; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,CIVI %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,CIVI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,CIVI,CIVI-HSA %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX11 %s
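
; Check selection of flat loads and stores for pointers produced by
; addrspacecast from the global and private (scratch) address spaces,
; including immediate-offset folding on targets that support it.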
; GCN-LABEL: {{^}}store_flat_i32:
; GCN-DAG: s_load_{{dwordx2|b64}} s[[[LO_SREG:[0-9]+]]:[[HI_SREG:[0-9]+]]],
; GCN-DAG: s_load_{{dword|b32}} s[[SDATA:[0-9]+]],
; GCN: s_waitcnt lgkmcnt(0)
; GCN-DAG: v_mov_b32_e32 v[[DATA:[0-9]+]], s[[SDATA]]
; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
; GCN: flat_store_{{dword|b32}} v[[[LO_VREG]]:[[HI_VREG]]], v[[DATA]]
define amdgpu_kernel void @store_flat_i32(ptr addrspace(1) %gptr, i32 %x) #0 {
  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
  store volatile i32 %x, ptr %fptr, align 4
  ret void
}

; GCN-LABEL: {{^}}store_flat_i64:
; GCN: flat_store_{{dwordx2|b64}}
define amdgpu_kernel void @store_flat_i64(ptr addrspace(1) %gptr, i64 %x) #0 {
  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
  store volatile i64 %x, ptr %fptr, align 8
  ret void
}

; GCN-LABEL: {{^}}store_flat_v4i32:
; GCN: flat_store_{{dwordx4|b128}}
define amdgpu_kernel void @store_flat_v4i32(ptr addrspace(1) %gptr, <4 x i32> %x) #0 {
  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
  store volatile <4 x i32> %x, ptr %fptr, align 16
  ret void
}

; GCN-LABEL: {{^}}store_flat_trunc_i16:
; GCN: flat_store_{{short|b16}}
define amdgpu_kernel void @store_flat_trunc_i16(ptr addrspace(1) %gptr, i32 %x) #0 {
  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
  %y = trunc i32 %x to i16
  store volatile i16 %y, ptr %fptr, align 2
  ret void
}

; GCN-LABEL: {{^}}store_flat_trunc_i8:
; GCN: flat_store_{{byte|b8}}
define amdgpu_kernel void @store_flat_trunc_i8(ptr addrspace(1) %gptr, i32 %x) #0 {
  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
  %y = trunc i32 %x to i8
  store volatile i8 %y, ptr %fptr, align 2
  ret void
}

; GCN-LABEL: load_flat_i32:
; GCN: flat_load_{{dword|b32}}
define amdgpu_kernel void @load_flat_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %gptr) #0 {
  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
  %fload = load volatile i32, ptr %fptr, align 4
  store i32 %fload, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: load_flat_i64:
; GCN: flat_load_{{dwordx2|b64}}
define amdgpu_kernel void @load_flat_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %gptr) #0 {
  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
  %fload = load volatile i64, ptr %fptr, align 8
  store i64 %fload, ptr addrspace(1) %out, align 8
  ret void
}

; GCN-LABEL: load_flat_v4i32:
; GCN: flat_load_{{dwordx4|b128}}
define amdgpu_kernel void @load_flat_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %gptr) #0 {
  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
  %fload = load volatile <4 x i32>, ptr %fptr, align 32
  store <4 x i32> %fload, ptr addrspace(1) %out, align 8
  ret void
}

; GCN-LABEL: sextload_flat_i8:
; GCN: flat_load_{{sbyte|i8}}
define amdgpu_kernel void @sextload_flat_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %gptr) #0 {
  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
  %fload = load volatile i8, ptr %fptr, align 4
  %ext = sext i8 %fload to i32
  store i32 %ext, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: zextload_flat_i8:
; GCN: flat_load_{{ubyte|u8}}
define amdgpu_kernel void @zextload_flat_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %gptr) #0 {
  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
  %fload = load volatile i8, ptr %fptr, align 4
  %ext = zext i8 %fload to i32
  store i32 %ext, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: sextload_flat_i16:
; GCN: flat_load_{{sshort|i16}}
define amdgpu_kernel void @sextload_flat_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %gptr) #0 {
  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
  %fload = load volatile i16, ptr %fptr, align 4
  %ext = sext i16 %fload to i32
  store i32 %ext, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: zextload_flat_i16:
; GCN: flat_load_{{ushort|u16}}
define amdgpu_kernel void @zextload_flat_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %gptr) #0 {
  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
  %fload = load volatile i16, ptr %fptr, align 4
  %ext = zext i16 %fload to i32
  store i32 %ext, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: flat_scratch_unaligned_load:
; GCN: flat_load_{{ubyte|u8}}
; GCN: flat_load_{{ubyte|u8}}
; GCN: flat_load_{{ubyte|u8}}
; GCN: flat_load_{{ubyte|u8}}
define amdgpu_kernel void @flat_scratch_unaligned_load() {
  %scratch = alloca i32, addrspace(5)
  %fptr = addrspacecast ptr addrspace(5) %scratch to ptr
  store volatile ptr %fptr, ptr addrspace(3) null
  %ld = load volatile i32, ptr %fptr, align 1
  ret void
}

; GCN-LABEL: flat_scratch_unaligned_store:
; GCN: flat_store_{{byte|b8}}
; GCN: flat_store_{{byte|b8}}
; GCN: flat_store_{{byte|b8}}
; GCN: flat_store_{{byte|b8}}
define amdgpu_kernel void @flat_scratch_unaligned_store() {
  %scratch = alloca i32, addrspace(5)
  %fptr = addrspacecast ptr addrspace(5) %scratch to ptr
  store volatile ptr %fptr, ptr addrspace(3) null
  store volatile i32 0, ptr %fptr, align 1
  ret void
}

; GCN-LABEL: flat_scratch_multidword_load_kernel:
; CIVI-HSA: flat_load_dword v
; CIVI-HSA: flat_load_dword v
; GFX9: flat_load_dwordx2
; GFX10PLUS: flat_load_{{dwordx2|b64}}
; FIXME: These tests are broken for os = mesa3d, because it doesn't initialize flat_scr
define amdgpu_kernel void @flat_scratch_multidword_load_kernel() {
  %scratch = alloca <2 x i32>, addrspace(5)
  %fptr = addrspacecast ptr addrspace(5) %scratch to ptr
  %ld = load volatile <2 x i32>, ptr %fptr
  ret void
}

; GCN-LABEL: flat_scratch_multidword_load_func:
; CIVI-HSA: flat_load_dword v
; CIVI-HSA: flat_load_dword v
; GFX9: flat_load_dwordx2
; GFX10PLUS: flat_load_{{dwordx2|b64}}
; FIXME: These tests are broken for os = mesa3d, because it doesn't initialize flat_scr
define <2 x i32> @flat_scratch_multidword_load_func(ptr %maybe.scratch) {
  %load = load <2 x i32>, ptr %maybe.scratch
  ret <2 x i32> %load
}

; GCN-LABEL: flat_scratch_multidword_store_kernel:
; CIVI-HSA: flat_store_dword v
; CIVI-HSA: flat_store_dword v
; GFX9: flat_store_dwordx2
; GFX10PLUS: flat_store_{{dwordx2|b64}}
; FIXME: These tests are broken for os = mesa3d, because it doesn't initialize flat_scr
define amdgpu_kernel void @flat_scratch_multidword_store_kernel() {
  %scratch = alloca <2 x i32>, addrspace(5)
  %fptr = addrspacecast ptr addrspace(5) %scratch to ptr
  store volatile <2 x i32> zeroinitializer, ptr %fptr
  ret void
}

; GCN-LABEL: flat_scratch_multidword_store_func:
; CIVI-HSA: flat_store_dword v
; CIVI-HSA: flat_store_dword v
; GFX9: flat_store_dwordx2
; GFX10PLUS: flat_store_{{dwordx2|b64}}
define void @flat_scratch_multidword_store_func(ptr %maybe.scratch) {
  store <2 x i32> zeroinitializer, ptr %maybe.scratch
  ret void
}

; GCN-LABEL: {{^}}store_flat_i8_max_offset:
; CIVI: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}}
; GFX9: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:4095{{$}}
define amdgpu_kernel void @store_flat_i8_max_offset(ptr %fptr, i8 %x) #0 {
  %fptr.offset = getelementptr inbounds i8, ptr %fptr, i64 4095
  store volatile i8 %x, ptr %fptr.offset
  ret void
}

; GCN-LABEL: {{^}}store_flat_i8_max_offset_p1:
; GCN: flat_store_{{byte|b8}} v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{( dlc)?}}{{$}}
define amdgpu_kernel void @store_flat_i8_max_offset_p1(ptr %fptr, i8 %x) #0 {
  %fptr.offset = getelementptr inbounds i8, ptr %fptr, i64 4096
  store volatile i8 %x, ptr %fptr.offset
  ret void
}

; GCN-LABEL: {{^}}store_flat_i8_neg_offset:
; CIVI: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}}
; GFX9: v_add_co_u32_e64 v{{[0-9]+}}, vcc, -2, s
; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1,
; GFX9: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}}
define amdgpu_kernel void @store_flat_i8_neg_offset(ptr %fptr, i8 %x) #0 {
  %fptr.offset = getelementptr inbounds i8, ptr %fptr, i64 -2
  store volatile i8 %x, ptr %fptr.offset
  ret void
}

; GCN-LABEL: {{^}}load_flat_i8_max_offset:
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095 glc{{$}}
; GFX10: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
; GFX11: flat_load_u8 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095 glc dlc{{$}}
define amdgpu_kernel void @load_flat_i8_max_offset(ptr %fptr) #0 {
  %fptr.offset = getelementptr inbounds i8, ptr %fptr, i64 4095
  %val = load volatile i8, ptr %fptr.offset
  ret void
}

; GCN-LABEL: {{^}}load_flat_i8_max_offset_p1:
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX10PLUS: flat_load_{{ubyte|u8}} v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
define amdgpu_kernel void @load_flat_i8_max_offset_p1(ptr %fptr) #0 {
  %fptr.offset = getelementptr inbounds i8, ptr %fptr, i64 4096
  %val = load volatile i8, ptr %fptr.offset
  ret void
}

; GCN-LABEL: {{^}}load_flat_i8_neg_offset:
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX9: v_add_co_u32_e64 v{{[0-9]+}}, vcc, -2, s
; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1,
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
define amdgpu_kernel void @load_flat_i8_neg_offset(ptr %fptr) #0 {
  %fptr.offset = getelementptr inbounds i8, ptr %fptr, i64 -2
  %val = load volatile i8, ptr %fptr.offset
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind convergent }