1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=PREGFX10-UNPACKED %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -check-prefixes=PREGFX10-PACKED %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=PREGFX10-PACKED %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-PACKED %s
6 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-PACKED %s
; Kernel that stores a single scalar half through the f16 overload of the
; struct-pointer tbuffer store intrinsic, passing %data, %rsrc and %vindex
; followed by the constants 0, 0, 33, 0.
; Every configuration is expected to emit one D16 "x" store with idxen. The
; store's format is printed as BUF_NUM_FORMAT_USCALED on tonga/gfx810/gfx900,
; BUF_FMT_10_11_11_SSCALED on gfx1010 and BUF_FMT_10_10_10_2_SNORM on gfx1100,
; where the mnemonic is spelled tbuffer_store_d16_format_x.
; NOTE(review): presumably the constant 33 is the intrinsic's format operand
; and is what selects those printed names - confirm against the intrinsic docs.
8 define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %data, i32 %vindex) {
9 ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_x:
10 ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body
11 ; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
12 ; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
13 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0)
14 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4
15 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s5
16 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_x v0, v1, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
17 ; PREGFX10-UNPACKED-NEXT: s_endpgm
19 ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_x:
20 ; PREGFX10-PACKED: ; %bb.0: ; %main_body
21 ; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
22 ; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
23 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0)
24 ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4
25 ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5
26 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, v1, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
27 ; PREGFX10-PACKED-NEXT: s_endpgm
29 ; GFX10-PACKED-LABEL: tbuffer_store_d16_x:
30 ; GFX10-PACKED: ; %bb.0: ; %main_body
31 ; GFX10-PACKED-NEXT: s_clause 0x1
32 ; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
33 ; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
34 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0)
35 ; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4
36 ; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5
37 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen
38 ; GFX10-PACKED-NEXT: s_endpgm
40 ; GFX11-PACKED-LABEL: tbuffer_store_d16_x:
41 ; GFX11-PACKED: ; %bb.0: ; %main_body
42 ; GFX11-PACKED-NEXT: s_clause 0x1
43 ; GFX11-PACKED-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
44 ; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
45 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0)
46 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s6
47 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s7
48 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen
49 ; GFX11-PACKED-NEXT: s_endpgm
51 call void @llvm.amdgcn.struct.ptr.tbuffer.store.f16(half %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0)
; Kernel that stores a <2 x half> through the v2f16 overload of the
; struct-pointer tbuffer store intrinsic, passing %data, %rsrc and %vindex
; followed by the constants 0, 0, 33, 0.
; The expected output differs in how the data operand is laid out:
;  - tonga (PREGFX10-UNPACKED prefix) splits the loaded dword into its low
;    half (s_and_b32 with 0xffff) and high half (s_lshr_b32 by 16) and stores
;    from the two-register tuple v[0:1];
;  - the packed configurations (gfx810, gfx900, gfx1010, gfx1100) copy the
;    dword unchanged and store from the single register v0.
55 define amdgpu_kernel void @tbuffer_store_d16_xy(ptr addrspace(8) %rsrc, <2 x half> %data, i32 %vindex) {
56 ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xy:
57 ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body
58 ; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
59 ; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
60 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0)
61 ; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s4, 16
62 ; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff
63 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4
64 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s6
65 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5
66 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xy v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
67 ; PREGFX10-UNPACKED-NEXT: s_endpgm
69 ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xy:
70 ; PREGFX10-PACKED: ; %bb.0: ; %main_body
71 ; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
72 ; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
73 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0)
74 ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4
75 ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5
76 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, v1, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
77 ; PREGFX10-PACKED-NEXT: s_endpgm
79 ; GFX10-PACKED-LABEL: tbuffer_store_d16_xy:
80 ; GFX10-PACKED: ; %bb.0: ; %main_body
81 ; GFX10-PACKED-NEXT: s_clause 0x1
82 ; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
83 ; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
84 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0)
85 ; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4
86 ; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5
87 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, v1, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen
88 ; GFX10-PACKED-NEXT: s_endpgm
90 ; GFX11-PACKED-LABEL: tbuffer_store_d16_xy:
91 ; GFX11-PACKED: ; %bb.0: ; %main_body
92 ; GFX11-PACKED-NEXT: s_clause 0x1
93 ; GFX11-PACKED-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
94 ; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
95 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0)
96 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s6
97 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s7
98 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xy v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen
99 ; GFX11-PACKED-NEXT: s_endpgm
101 call void @llvm.amdgcn.struct.ptr.tbuffer.store.v2f16(<2 x half> %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0)
; Kernel that stores the first three lanes of a <4 x half> argument through
; the v3f16 overload of the struct-pointer tbuffer store intrinsic, passing
; the narrowed vector, %rsrc and %vindex followed by the constants 0, 0, 33, 0.
; The expected output differs in how the data operand is laid out:
;  - tonga (PREGFX10-UNPACKED prefix) expands the three halves into one
;    register each and stores from v[0:2];
;  - the packed configurations (gfx810, gfx900, gfx1010, gfx1100) keep two
;    dwords in v[0:1] and mask the second dword with 0xffff before the store.
105 define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x half> %data, i32 %vindex) {
106 ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyz:
107 ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body
108 ; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
109 ; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
110 ; PREGFX10-UNPACKED-NEXT: s_load_dword s6, s[8:9], 0x18
111 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0)
112 ; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff
113 ; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s4, 16
114 ; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff
115 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4
116 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s7
117 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5
118 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s6
119 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyz v[0:2], v3, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
120 ; PREGFX10-UNPACKED-NEXT: s_endpgm
122 ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyz:
123 ; PREGFX10-PACKED: ; %bb.0: ; %main_body
124 ; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
125 ; PREGFX10-PACKED-NEXT: s_load_dword s6, s[8:9], 0x18
126 ; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
127 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0)
128 ; PREGFX10-PACKED-NEXT: s_and_b32 s5, s5, 0xffff
129 ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4
130 ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5
131 ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v2, s6
132 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
133 ; PREGFX10-PACKED-NEXT: s_endpgm
135 ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyz:
136 ; GFX10-PACKED: ; %bb.0: ; %main_body
137 ; GFX10-PACKED-NEXT: s_clause 0x2
138 ; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
139 ; GFX10-PACKED-NEXT: s_load_dword s6, s[8:9], 0x18
140 ; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
141 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0)
142 ; GFX10-PACKED-NEXT: s_and_b32 s5, s5, 0xffff
143 ; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4
144 ; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5
145 ; GFX10-PACKED-NEXT: v_mov_b32_e32 v2, s6
146 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen
147 ; GFX10-PACKED-NEXT: s_endpgm
149 ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyz:
150 ; GFX11-PACKED: ; %bb.0: ; %main_body
151 ; GFX11-PACKED-NEXT: s_clause 0x2
152 ; GFX11-PACKED-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
153 ; GFX11-PACKED-NEXT: s_load_b32 s8, s[4:5], 0x18
154 ; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
155 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0)
156 ; GFX11-PACKED-NEXT: s_and_b32 s4, s7, 0xffff
157 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s6
158 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s4
159 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v2, s8
160 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyz v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen
161 ; GFX11-PACKED-NEXT: s_endpgm
; Narrow %data to its first three lanes. The mask never selects from the
; second operand, so it is a poison placeholder rather than the deprecated
; undef; the selected lanes are unaffected by this choice.
163 %data_subvec = shufflevector <4 x half> %data, <4 x half> poison, <3 x i32> <i32 0, i32 1, i32 2>
164 call void @llvm.amdgcn.struct.ptr.tbuffer.store.v3f16(<3 x half> %data_subvec, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0)
; Kernel that stores a full <4 x half> through the v4f16 overload of the
; struct-pointer tbuffer store intrinsic, passing %data, %rsrc and %vindex
; followed by the constants 0, 0, 33, 0.
; The expected output differs in how the data operand is laid out:
;  - tonga (PREGFX10-UNPACKED prefix) splits both loaded dwords into low and
;    high halves (s_and_b32 with 0xffff / s_lshr_b32 by 16) and stores from
;    the four-register tuple v[0:3];
;  - the packed configurations (gfx810, gfx900, gfx1010, gfx1100) copy the two
;    dwords unchanged and store from v[0:1].
168 define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x half> %data, i32 %vindex) {
169 ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyzw:
170 ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body
171 ; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
172 ; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
173 ; PREGFX10-UNPACKED-NEXT: s_load_dword s6, s[8:9], 0x18
174 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0)
175 ; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s5, 16
176 ; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff
177 ; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s8, s4, 16
178 ; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff
179 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4
180 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s8
181 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5
182 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s7
183 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v4, s6
184 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
185 ; PREGFX10-UNPACKED-NEXT: s_endpgm
187 ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyzw:
188 ; PREGFX10-PACKED: ; %bb.0: ; %main_body
189 ; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
190 ; PREGFX10-PACKED-NEXT: s_load_dword s6, s[8:9], 0x18
191 ; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
192 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0)
193 ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4
194 ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5
195 ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v2, s6
196 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
197 ; PREGFX10-PACKED-NEXT: s_endpgm
199 ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyzw:
200 ; GFX10-PACKED: ; %bb.0: ; %main_body
201 ; GFX10-PACKED-NEXT: s_clause 0x2
202 ; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
203 ; GFX10-PACKED-NEXT: s_load_dword s6, s[8:9], 0x18
204 ; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
205 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0)
206 ; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4
207 ; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5
208 ; GFX10-PACKED-NEXT: v_mov_b32_e32 v2, s6
209 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen
210 ; GFX10-PACKED-NEXT: s_endpgm
212 ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyzw:
213 ; GFX11-PACKED: ; %bb.0: ; %main_body
214 ; GFX11-PACKED-NEXT: s_clause 0x2
215 ; GFX11-PACKED-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
216 ; GFX11-PACKED-NEXT: s_load_b32 s8, s[4:5], 0x18
217 ; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
218 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0)
219 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s6
220 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s7
221 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v2, s8
222 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen
223 ; GFX11-PACKED-NEXT: s_endpgm
225 call void @llvm.amdgcn.struct.ptr.tbuffer.store.v4f16(<4 x half> %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0)
; Declarations of the four intrinsic overloads called by the kernels in this
; file: f16, v2f16, v3f16 and v4f16. Each takes the data value to store, a
; ptr addrspace(8) operand and five i32 operands, and returns void.
229 declare void @llvm.amdgcn.struct.ptr.tbuffer.store.f16(half, ptr addrspace(8), i32, i32, i32, i32, i32)
230 declare void @llvm.amdgcn.struct.ptr.tbuffer.store.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32, i32, i32)
231 declare void @llvm.amdgcn.struct.ptr.tbuffer.store.v3f16(<3 x half>, ptr addrspace(8), i32, i32, i32, i32, i32)
232 declare void @llvm.amdgcn.struct.ptr.tbuffer.store.v4f16(<4 x half>, ptr addrspace(8), i32, i32, i32, i32, i32)