1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2 ;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=CHECK,SI %s
3 ;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VI %s
; Three v4f32 struct-ptr buffer stores whose index and offset operands are all
; constant zero; only the last i32 operand differs (0, 1, 2). The expected
; output is a plain store, a store with `glc`, and a store with `slc`, all
; sharing one zero-initialised index register (v12) and using `idxen`.
5 define amdgpu_ps void @buffer_store(ptr addrspace(8) inreg, <4 x float>, <4 x float>, <4 x float>) {
6 ; CHECK-LABEL: buffer_store:
7 ; CHECK: ; %bb.0: ; %main_body
8 ; CHECK-NEXT: v_mov_b32_e32 v12, 0
9 ; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v12, s[0:3], 0 idxen
10 ; CHECK-NEXT: buffer_store_dwordx4 v[4:7], v12, s[0:3], 0 idxen glc
11 ; CHECK-NEXT: buffer_store_dwordx4 v[8:11], v12, s[0:3], 0 idxen slc
12 ; CHECK-NEXT: s_endpgm
14 call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0)
15 call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %2, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 1)
16 call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %3, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 2)
; Store with a constant 42 in the second i32 operand. The expected output
; carries it as the instruction's immediate `offset:42` field, with a zeroed
; index register (v4) and `idxen`.
20 define amdgpu_ps void @buffer_store_immoffs(ptr addrspace(8) inreg, <4 x float>) {
21 ; CHECK-LABEL: buffer_store_immoffs:
22 ; CHECK: ; %bb.0: ; %main_body
23 ; CHECK-NEXT: v_mov_b32_e32 v4, 0
24 ; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen offset:42
25 ; CHECK-NEXT: s_endpgm
27 call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 42, i32 0, i32 0)
; Store with a variable first i32 operand (%2) and zero for the remaining
; operands. The expected output uses the incoming register (v4) directly as
; the `idxen` address with no additional moves.
31 define amdgpu_ps void @buffer_store_idx(ptr addrspace(8) inreg, <4 x float>, i32) {
32 ; CHECK-LABEL: buffer_store_idx:
33 ; CHECK: ; %bb.0: ; %main_body
34 ; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
35 ; CHECK-NEXT: s_endpgm
37 call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i32 0, i32 0)
; Store with a constant-zero first i32 operand and a variable second i32
; operand (%2). The expected output builds a two-register address v[4:5]:
; the incoming value is copied from v4 to v5, v4 is then filled with zero
; (via s4), and the store uses both `idxen` and `offen`.
41 define amdgpu_ps void @buffer_store_ofs(ptr addrspace(8) inreg, <4 x float>, i32) {
42 ; CHECK-LABEL: buffer_store_ofs:
43 ; CHECK: ; %bb.0: ; %main_body
44 ; CHECK-NEXT: s_mov_b32 s4, 0
45 ; CHECK-NEXT: v_mov_b32_e32 v5, v4
46 ; CHECK-NEXT: v_mov_b32_e32 v4, s4
47 ; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen
48 ; CHECK-NEXT: s_endpgm
50 call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 %2, i32 0, i32 0)
; Store with both the first and second i32 operands variable, passed in
; argument order (%2, %3). The expected output uses the incoming register
; pair v[4:5] as-is with `idxen offen`; no copies are expected.
54 define amdgpu_ps void @buffer_store_both(ptr addrspace(8) inreg, <4 x float>, i32, i32) {
55 ; CHECK-LABEL: buffer_store_both:
56 ; CHECK: ; %bb.0: ; %main_body
57 ; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen
58 ; CHECK-NEXT: s_endpgm
60 call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 %3, i32 0, i32 0)
; Same as buffer_store_both, but the two variable i32 operands are swapped
; (%3 first, %2 second). The expected output needs a single copy (v4 -> v6)
; so that the address pair becomes v[5:6] for the `idxen offen` store.
64 define amdgpu_ps void @buffer_store_both_reversed(ptr addrspace(8) inreg, <4 x float>, i32, i32) {
65 ; CHECK-LABEL: buffer_store_both_reversed:
66 ; CHECK: ; %bb.0: ; %main_body
67 ; CHECK-NEXT: v_mov_b32_e32 v6, v4
68 ; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen
69 ; CHECK-NEXT: s_endpgm
71 call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %3, i32 %2, i32 0, i32 0)
75 ; Ideally, the register allocator would avoid the wait here
; Store / load / store sequence on the same resource with three different
; index arguments. In the expected output the load writes v[0:3], the same
; registers the first store reads its data from. For verde an
; `s_waitcnt expcnt(0)` is expected between that store and the load; the
; tonga output has no such wait. Both targets are expected to wait on
; vmcnt(0) before storing the loaded data.
76 define amdgpu_ps void @buffer_store_wait(ptr addrspace(8) inreg, <4 x float>, i32, i32, i32) {
77 ; SI-LABEL: buffer_store_wait:
78 ; SI: ; %bb.0: ; %main_body
79 ; SI-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
80 ; SI-NEXT: s_waitcnt expcnt(0)
81 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen
82 ; SI-NEXT: s_waitcnt vmcnt(0)
83 ; SI-NEXT: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen
86 ; VI-LABEL: buffer_store_wait:
87 ; VI: ; %bb.0: ; %main_body
88 ; VI-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
89 ; VI-NEXT: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen
90 ; VI-NEXT: s_waitcnt vmcnt(0)
91 ; VI-NEXT: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen
94 call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i32 0, i32 0)
95 %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 %3, i32 0, i32 0, i32 0)
96 call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %0, i32 %4, i32 0, i32 0, i32 0)
; Store of a single float with a variable index. The expected output is one
; `buffer_store_dword` taking the data from v0 and the index from v1.
100 define amdgpu_ps void @buffer_store_x1(ptr addrspace(8) inreg %rsrc, float %data, i32 %index) {
101 ; CHECK-LABEL: buffer_store_x1:
102 ; CHECK: ; %bb.0: ; %main_body
103 ; CHECK-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen
104 ; CHECK-NEXT: s_endpgm
106 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0)
; Store of a <2 x float> with a variable index. The expected output is one
; `buffer_store_dwordx2` taking the data from v[0:1] and the index from v2.
110 define amdgpu_ps void @buffer_store_x2(ptr addrspace(8) inreg %rsrc, <2 x float> %data, i32 %index) #0 {
111 ; CHECK-LABEL: buffer_store_x2:
112 ; CHECK: ; %bb.0: ; %main_body
113 ; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen
114 ; CHECK-NEXT: s_endpgm
116 call void @llvm.amdgcn.struct.ptr.buffer.store.v2f32(<2 x float> %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0)
; Integer variants of the store: <4 x i32>, <2 x i32> and i32 data, all with
; constant-zero index/offset operands and a last i32 operand of 0, 1 and 2.
; The expected output is dwordx4 (plain), dwordx2 with `glc` and dword with
; `slc`, sharing one zeroed index register (v7).
120 define amdgpu_ps void @buffer_store_int(ptr addrspace(8) inreg, <4 x i32>, <2 x i32>, i32) {
121 ; CHECK-LABEL: buffer_store_int:
122 ; CHECK: ; %bb.0: ; %main_body
123 ; CHECK-NEXT: v_mov_b32_e32 v7, 0
124 ; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v7, s[0:3], 0 idxen
125 ; CHECK-NEXT: buffer_store_dwordx2 v[4:5], v7, s[0:3], 0 idxen glc
126 ; CHECK-NEXT: buffer_store_dword v6, v7, s[0:3], 0 idxen slc
127 ; CHECK-NEXT: s_endpgm
129 call void @llvm.amdgcn.struct.ptr.buffer.store.v4i32(<4 x i32> %1, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0)
130 call void @llvm.amdgcn.struct.ptr.buffer.store.v2i32(<2 x i32> %2, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 1)
131 call void @llvm.amdgcn.struct.ptr.buffer.store.i32(i32 %3, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 2)
; i8 store: the float argument is converted with fptoui to i32 and truncated
; to i8 before the store. The expected output is the float-to-unsigned
; conversion followed directly by `buffer_store_byte`; no separate
; instruction for the truncation is expected.
135 define amdgpu_ps void @struct_ptr_buffer_store_byte(ptr addrspace(8) inreg %rsrc, float %v1, i32 %index) {
136 ; CHECK-LABEL: struct_ptr_buffer_store_byte:
137 ; CHECK: ; %bb.0: ; %main_body
138 ; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0
139 ; CHECK-NEXT: buffer_store_byte v0, v1, s[0:3], 0 idxen
140 ; CHECK-NEXT: s_endpgm
142 %v2 = fptoui float %v1 to i32
143 %v3 = trunc i32 %v2 to i8
144 call void @llvm.amdgcn.struct.ptr.buffer.store.i8(i8 %v3, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0)
; half store: the float argument is narrowed with fptrunc before the store.
; The expected output is a `v_cvt_f16_f32` on v0 followed by
; `buffer_store_short` with the index in v1.
148 define amdgpu_ps void @struct_ptr_buffer_store_f16(ptr addrspace(8) inreg %rsrc, float %v1, i32 %index) {
149 ; CHECK-LABEL: struct_ptr_buffer_store_f16:
151 ; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0
152 ; CHECK-NEXT: buffer_store_short v0, v1, s[0:3], 0 idxen
153 ; CHECK-NEXT: s_endpgm
154 %v2 = fptrunc float %v1 to half
155 call void @llvm.amdgcn.struct.ptr.buffer.store.f16(half %v2, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0)
; <2 x half> store with a variable index; the two targets differ.
; Expected verde output: v0 and v1 are each converted with `v_cvt_f16_f32`,
; packed into one dword (shift left by 16, then or) and stored with the
; index in v2.
; Expected tonga output: a single `buffer_store_dword` of v0 with the index
; in v1.
159 define amdgpu_ps void @struct_ptr_buffer_store_v2f16(ptr addrspace(8) inreg %rsrc, <2 x half> %v1, i32 %index) {
160 ; SI-LABEL: struct_ptr_buffer_store_v2f16:
162 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
163 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
164 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
165 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
166 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 idxen
169 ; VI-LABEL: struct_ptr_buffer_store_v2f16:
171 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen
173 call void @llvm.amdgcn.struct.ptr.buffer.store.v2f16(<2 x half> %v1, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0)
; <4 x half> store with a variable index; the two targets differ.
; Expected verde output: v0..v3 are each converted with `v_cvt_f16_f32`,
; packed pairwise (shift left by 16, then or) into v[0:1] and stored as
; dwordx2 with the index in v4.
; Expected tonga output: a single `buffer_store_dwordx2` of v[0:1] with the
; index in v2.
177 define amdgpu_ps void @struct_ptr_buffer_store_v4f16(ptr addrspace(8) inreg %rsrc, <4 x half> %v1, i32 %index) {
178 ; SI-LABEL: struct_ptr_buffer_store_v4f16:
180 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
181 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
182 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v1
183 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
184 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
185 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
186 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5
187 ; SI-NEXT: v_or_b32_e32 v0, v0, v2
188 ; SI-NEXT: buffer_store_dwordx2 v[0:1], v4, s[0:3], 0 idxen
191 ; VI-LABEL: struct_ptr_buffer_store_v4f16:
193 ; VI-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen
195 call void @llvm.amdgcn.struct.ptr.buffer.store.v4f16(<4 x half> %v1, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0)
; i16 store: the float argument is converted with fptoui to i32 and truncated
; to i16 before the store. The expected output is the float-to-unsigned
; conversion followed directly by `buffer_store_short`; no separate
; instruction for the truncation is expected.
199 define amdgpu_ps void @struct_ptr_buffer_store_i16(ptr addrspace(8) inreg %rsrc, float %v1, i32 %index) {
200 ; CHECK-LABEL: struct_ptr_buffer_store_i16:
201 ; CHECK: ; %bb.0: ; %main_body
202 ; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0
203 ; CHECK-NEXT: buffer_store_short v0, v1, s[0:3], 0 idxen
204 ; CHECK-NEXT: s_endpgm
206 %v2 = fptoui float %v1 to i32
207 %v3 = trunc i32 %v2 to i16
208 call void @llvm.amdgcn.struct.ptr.buffer.store.i16(i16 %v3, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0)
; <2 x i16> store with a variable index; the two targets differ.
; Expected verde output: v1 is shifted left by 16, v0 is masked with 0xffff,
; the two are or'ed into one dword and stored with the index in v2.
; Expected tonga output: a single `buffer_store_dword` of v0 with the index
; in v1.
212 define amdgpu_ps void @struct_ptr_buffer_store_vif16(ptr addrspace(8) inreg %rsrc, <2 x i16> %v1, i32 %index) {
213 ; SI-LABEL: struct_ptr_buffer_store_vif16:
215 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
216 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
217 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
218 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 idxen
221 ; VI-LABEL: struct_ptr_buffer_store_vif16:
223 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen
225 call void @llvm.amdgcn.struct.ptr.buffer.store.v2i16(<2 x i16> %v1, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0)
; <4 x i16> store with a variable index; the two targets differ.
; Expected verde output: v0..v3 are packed pairwise (shift left by 16, mask
; with 0xffff, then or) into v[1:2] and stored as dwordx2 with the index
; in v4.
; Expected tonga output: a single `buffer_store_dwordx2` of v[0:1] with the
; index in v2.
229 define amdgpu_ps void @struct_ptr_buffer_store_v4i16(ptr addrspace(8) inreg %rsrc, <4 x i16> %v1, i32 %index) {
230 ; SI-LABEL: struct_ptr_buffer_store_v4i16:
232 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
233 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
234 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
235 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
236 ; SI-NEXT: v_or_b32_e32 v2, v2, v3
237 ; SI-NEXT: v_or_b32_e32 v1, v0, v1
238 ; SI-NEXT: buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 idxen
241 ; VI-LABEL: struct_ptr_buffer_store_v4i16:
243 ; VI-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen
245 call void @llvm.amdgcn.struct.ptr.buffer.store.v4i16(<4 x i16> %v1, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0)
; Declarations of the struct.ptr buffer intrinsics exercised by the tests in
; this file: one store overload per stored data type, plus the v4f32 load
; used by buffer_store_wait. Every store takes the data, the
; ptr addrspace(8) resource and four i32 operands; the load takes the
; resource and four i32 operands.
249 declare void @llvm.amdgcn.struct.ptr.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32, i32) #0
250 declare void @llvm.amdgcn.struct.ptr.buffer.store.v2f32(<2 x float>, ptr addrspace(8), i32, i32, i32, i32) #0
251 declare void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32, i32) #0
252 declare void @llvm.amdgcn.struct.ptr.buffer.store.i32(i32, ptr addrspace(8), i32, i32, i32, i32) #0
253 declare void @llvm.amdgcn.struct.ptr.buffer.store.v2i32(<2 x i32>, ptr addrspace(8), i32, i32, i32, i32) #0
254 declare void @llvm.amdgcn.struct.ptr.buffer.store.v4i32(<4 x i32>, ptr addrspace(8), i32, i32, i32, i32) #0
255 declare <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8), i32, i32, i32, i32) #1
256 declare void @llvm.amdgcn.struct.ptr.buffer.store.i8(i8, ptr addrspace(8), i32, i32, i32, i32) #0
257 declare void @llvm.amdgcn.struct.ptr.buffer.store.i16(i16, ptr addrspace(8), i32, i32, i32, i32) #0
258 declare void @llvm.amdgcn.struct.ptr.buffer.store.v2i16(<2 x i16>, ptr addrspace(8), i32, i32, i32, i32) #0
259 declare void @llvm.amdgcn.struct.ptr.buffer.store.v4i16(<4 x i16>, ptr addrspace(8), i32, i32, i32, i32) #0
260 declare void @llvm.amdgcn.struct.ptr.buffer.store.f16(half, ptr addrspace(8), i32, i32, i32, i32) #0
261 declare void @llvm.amdgcn.struct.ptr.buffer.store.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32, i32) #0
262 declare void @llvm.amdgcn.struct.ptr.buffer.store.v4f16(<4 x half>, ptr addrspace(8), i32, i32, i32, i32) #0
; Attribute groups: #0 is attached to the store declarations above (and to
; the buffer_store_x2 definition); #1 is attached to the load declaration.
265 attributes #0 = { nounwind }
266 attributes #1 = { nounwind readonly }