; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire < %s | FileCheck -check-prefixes=CHECK,CIVI %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,CIVI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,CIVI,HSA %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,HSA,GFX9 %s
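
; Tests loads and stores through the flat (generic) address space:
; addrspacecasts from global and private pointers, truncating stores,
; extending loads, unaligned scratch accesses, and folding of immediate
; offsets into flat instructions on GFX9.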

; CHECK-LABEL: {{^}}store_flat_i32:
; CHECK-DAG: s_load_dwordx2 s{{\[}}[[LO_SREG:[0-9]+]]:[[HI_SREG:[0-9]+]]],
; CHECK-DAG: s_load_dword s[[SDATA:[0-9]+]],
; CHECK: s_waitcnt lgkmcnt(0)
; CHECK-DAG: v_mov_b32_e32 v[[DATA:[0-9]+]], s[[SDATA]]
; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
; CHECK: flat_store_dword v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}, v[[DATA]]
define amdgpu_kernel void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 {
  %fptr = addrspacecast i32 addrspace(1)* %gptr to i32*
  store volatile i32 %x, i32* %fptr, align 4
  ret void
}

; CHECK-LABEL: {{^}}store_flat_i64:
; CHECK: flat_store_dwordx2
define amdgpu_kernel void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 {
  %fptr = addrspacecast i64 addrspace(1)* %gptr to i64*
  store volatile i64 %x, i64* %fptr, align 8
  ret void
}

; CHECK-LABEL: {{^}}store_flat_v4i32:
; CHECK: flat_store_dwordx4
define amdgpu_kernel void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 {
  %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32>*
  store volatile <4 x i32> %x, <4 x i32>* %fptr, align 16
  ret void
}

; CHECK-LABEL: {{^}}store_flat_trunc_i16:
; CHECK: flat_store_short
define amdgpu_kernel void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 {
  %fptr = addrspacecast i16 addrspace(1)* %gptr to i16*
  %y = trunc i32 %x to i16
  store volatile i16 %y, i16* %fptr, align 2
  ret void
}

; CHECK-LABEL: {{^}}store_flat_trunc_i8:
; CHECK: flat_store_byte
define amdgpu_kernel void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 {
  %fptr = addrspacecast i8 addrspace(1)* %gptr to i8*
  %y = trunc i32 %x to i8
  store volatile i8 %y, i8* %fptr, align 2
  ret void
}

; CHECK-LABEL: load_flat_i32:
; CHECK: flat_load_dword
define amdgpu_kernel void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 {
  %fptr = addrspacecast i32 addrspace(1)* %gptr to i32*
  %fload = load volatile i32, i32* %fptr, align 4
  store i32 %fload, i32 addrspace(1)* %out, align 4
  ret void
}

; CHECK-LABEL: load_flat_i64:
; CHECK: flat_load_dwordx2
define amdgpu_kernel void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 {
  %fptr = addrspacecast i64 addrspace(1)* %gptr to i64*
  %fload = load volatile i64, i64* %fptr, align 8
  store i64 %fload, i64 addrspace(1)* %out, align 8
  ret void
}

; CHECK-LABEL: load_flat_v4i32:
; CHECK: flat_load_dwordx4
define amdgpu_kernel void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 {
  %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32>*
  %fload = load volatile <4 x i32>, <4 x i32>* %fptr, align 32
  store <4 x i32> %fload, <4 x i32> addrspace(1)* %out, align 8
  ret void
}

; CHECK-LABEL: sextload_flat_i8:
; CHECK: flat_load_sbyte
define amdgpu_kernel void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
  %fptr = addrspacecast i8 addrspace(1)* %gptr to i8*
  %fload = load volatile i8, i8* %fptr, align 4
  %ext = sext i8 %fload to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; CHECK-LABEL: zextload_flat_i8:
; CHECK: flat_load_ubyte
define amdgpu_kernel void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
  %fptr = addrspacecast i8 addrspace(1)* %gptr to i8*
  %fload = load volatile i8, i8* %fptr, align 4
  %ext = zext i8 %fload to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; CHECK-LABEL: sextload_flat_i16:
; CHECK: flat_load_sshort
define amdgpu_kernel void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
  %fptr = addrspacecast i16 addrspace(1)* %gptr to i16*
  %fload = load volatile i16, i16* %fptr, align 4
  %ext = sext i16 %fload to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; CHECK-LABEL: zextload_flat_i16:
; CHECK: flat_load_ushort
define amdgpu_kernel void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
  %fptr = addrspacecast i16 addrspace(1)* %gptr to i16*
  %fload = load volatile i16, i16* %fptr, align 4
  %ext = zext i16 %fload to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}
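
; The following tests cast a private (addrspace(5)) alloca to a flat
; pointer. An align-1 i32 access through the flat pointer is expanded
; into four byte-sized flat accesses, as the checks below verify.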

; CHECK-LABEL: flat_scratch_unaligned_load:
; CHECK: flat_load_ubyte
; CHECK: flat_load_ubyte
; CHECK: flat_load_ubyte
; CHECK: flat_load_ubyte
define amdgpu_kernel void @flat_scratch_unaligned_load() {
  %scratch = alloca i32, addrspace(5)
  %fptr = addrspacecast i32 addrspace(5)* %scratch to i32*
  %ld = load volatile i32, i32* %fptr, align 1
  ret void
}

; CHECK-LABEL: flat_scratch_unaligned_store:
; CHECK: flat_store_byte
; CHECK: flat_store_byte
; CHECK: flat_store_byte
; CHECK: flat_store_byte
define amdgpu_kernel void @flat_scratch_unaligned_store() {
  %scratch = alloca i32, addrspace(5)
  %fptr = addrspacecast i32 addrspace(5)* %scratch to i32*
  store volatile i32 0, i32* %fptr, align 1
  ret void
}

; CHECK-LABEL: flat_scratch_multidword_load:
; HSA: flat_load_dword
; HSA: flat_load_dword
; FIXME: These tests are broken for os = mesa3d, because it doesn't initialize flat_scr
define amdgpu_kernel void @flat_scratch_multidword_load() {
  %scratch = alloca <2 x i32>, addrspace(5)
  %fptr = addrspacecast <2 x i32> addrspace(5)* %scratch to <2 x i32>*
  %ld = load volatile <2 x i32>, <2 x i32>* %fptr
  ret void
}

; CHECK-LABEL: flat_scratch_multidword_store:
; HSA: flat_store_dword
; HSA: flat_store_dword
; FIXME: These tests are broken for os = mesa3d, because it doesn't initialize flat_scr
define amdgpu_kernel void @flat_scratch_multidword_store() {
  %scratch = alloca <2 x i32>, addrspace(5)
  %fptr = addrspacecast <2 x i32> addrspace(5)* %scratch to <2 x i32>*
  store volatile <2 x i32> zeroinitializer, <2 x i32>* %fptr
  ret void
}
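
; The remaining tests check folding of immediate offsets into flat
; instructions. GFX9 can encode an offset of up to 4095 directly in the
; instruction; CI/VI cannot. An offset that doesn't fit (4096, or a
; negative value) is instead added to the base pointer with a 64-bit
; VALU add, as the v_add_co/v_addc checks below show.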

; CHECK-LABEL: {{^}}store_flat_i8_max_offset:
; CIVI: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}}
; GFX9: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:4095{{$}}
define amdgpu_kernel void @store_flat_i8_max_offset(i8* %fptr, i8 %x) #0 {
  %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4095
  store volatile i8 %x, i8* %fptr.offset
  ret void
}

; CHECK-LABEL: {{^}}store_flat_i8_max_offset_p1:
; CHECK: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}}
define amdgpu_kernel void @store_flat_i8_max_offset_p1(i8* %fptr, i8 %x) #0 {
  %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4096
  store volatile i8 %x, i8* %fptr.offset
  ret void
}

; CHECK-LABEL: {{^}}store_flat_i8_neg_offset:
; CIVI: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}}

; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xfffff000, v
; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1,
; GFX9: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:4094{{$}}
define amdgpu_kernel void @store_flat_i8_neg_offset(i8* %fptr, i8 %x) #0 {
  %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 -2
  store volatile i8 %x, i8* %fptr.offset
  ret void
}

; CHECK-LABEL: {{^}}load_flat_i8_max_offset:
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}}
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095{{$}}
define amdgpu_kernel void @load_flat_i8_max_offset(i8* %fptr) #0 {
  %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4095
  %val = load volatile i8, i8* %fptr.offset
  ret void
}

; CHECK-LABEL: {{^}}load_flat_i8_max_offset_p1:
; CHECK: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}}
define amdgpu_kernel void @load_flat_i8_max_offset_p1(i8* %fptr) #0 {
  %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4096
  %val = load volatile i8, i8* %fptr.offset
  ret void
}

; CHECK-LABEL: {{^}}load_flat_i8_neg_offset:
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}}

; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xfffff000, v
; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1,
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4094{{$}}
define amdgpu_kernel void @load_flat_i8_neg_offset(i8* %fptr) #0 {
  %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 -2
  %val = load volatile i8, i8* %fptr.offset
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind convergent }