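; Captured from a repository web view; the three lines below are page metadata, not part of the test.
; Page heading (a commit title; it does not appear to describe this test): Run DCE after a LoopFlatten test to reduce spurious output [nfc]
; Path: [llvm-project.git] / llvm / test / CodeGen / AMDGPU / llvm.amdgcn.raw.tbuffer.store.d16.ll
; Blob: ea47d3b968d95ed8d66056111549c512ca6f4451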
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; Each invocation below compiles this file for one subtarget and checks the
; output against that subtarget's prefix. A full -mtriple is used rather than
; -march so the target does not inherit vendor/OS parts from the host default.
; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=PREGFX10-UNPACKED %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=PREGFX10-PACKED %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=PREGFX10-PACKED %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GFX10-PACKED %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs | FileCheck -check-prefixes=GFX11-PACKED %s
; Single-component case: stores one half through
; llvm.amdgcn.raw.tbuffer.store.f16. The i32 33 operand is the combined format
; immediate; the remaining i32 operands are all zero. The expected ISA for each
; subtarget prefix follows the signature.
define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) {
; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_x:
; PREGFX10-UNPACKED:       ; %bb.0: ; %main_body
; PREGFX10-UNPACKED-NEXT:    s_load_dword s4, s[0:1], 0x34
; PREGFX10-UNPACKED-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; PREGFX10-UNPACKED-NEXT:    s_waitcnt lgkmcnt(0)
; PREGFX10-UNPACKED-NEXT:    v_mov_b32_e32 v0, s4
; PREGFX10-UNPACKED-NEXT:    tbuffer_store_format_d16_x v0, off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED]
; PREGFX10-UNPACKED-NEXT:    s_endpgm
;
; PREGFX10-PACKED-LABEL: tbuffer_store_d16_x:
; PREGFX10-PACKED:       ; %bb.0: ; %main_body
; PREGFX10-PACKED-NEXT:    s_load_dword s2, s[0:1], 0x34
; PREGFX10-PACKED-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; PREGFX10-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
; PREGFX10-PACKED-NEXT:    v_mov_b32_e32 v0, s2
; PREGFX10-PACKED-NEXT:    tbuffer_store_format_d16_x v0, off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED]
; PREGFX10-PACKED-NEXT:    s_endpgm
;
; GFX10-PACKED-LABEL: tbuffer_store_d16_x:
; GFX10-PACKED:       ; %bb.0: ; %main_body
; GFX10-PACKED-NEXT:    s_clause 0x1
; GFX10-PACKED-NEXT:    s_load_dword s2, s[0:1], 0x34
; GFX10-PACKED-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PACKED-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-PACKED-NEXT:    tbuffer_store_format_d16_x v0, off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED]
; GFX10-PACKED-NEXT:    s_endpgm
;
; GFX11-PACKED-LABEL: tbuffer_store_d16_x:
; GFX11-PACKED:       ; %bb.0: ; %main_body
; GFX11-PACKED-NEXT:    s_clause 0x1
; GFX11-PACKED-NEXT:    s_load_b32 s4, s[0:1], 0x34
; GFX11-PACKED-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-PACKED-NEXT:    v_mov_b32_e32 v0, s4
; GFX11-PACKED-NEXT:    tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
; GFX11-PACKED-NEXT:    s_nop 0
; GFX11-PACKED-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PACKED-NEXT:    s_endpgm
main_body:
  call void @llvm.amdgcn.raw.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0)
  ret void
}
; Two-component case: stores a <2 x half> through
; llvm.amdgcn.raw.tbuffer.store.v2f16 with format immediate 33.
; In the expected output, tonga (the PREGFX10-UNPACKED prefix) splits the two
; halves into separate VGPRs (v[0:1]), while the other subtargets pass both
; halves in a single VGPR (v0).
define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data) {
; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xy:
; PREGFX10-UNPACKED:       ; %bb.0: ; %main_body
; PREGFX10-UNPACKED-NEXT:    s_load_dword s4, s[0:1], 0x34
; PREGFX10-UNPACKED-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; PREGFX10-UNPACKED-NEXT:    s_waitcnt lgkmcnt(0)
; PREGFX10-UNPACKED-NEXT:    s_lshr_b32 s5, s4, 16
; PREGFX10-UNPACKED-NEXT:    s_and_b32 s4, s4, 0xffff
; PREGFX10-UNPACKED-NEXT:    v_mov_b32_e32 v0, s4
; PREGFX10-UNPACKED-NEXT:    v_mov_b32_e32 v1, s5
; PREGFX10-UNPACKED-NEXT:    tbuffer_store_format_d16_xy v[0:1], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED]
; PREGFX10-UNPACKED-NEXT:    s_endpgm
;
; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xy:
; PREGFX10-PACKED:       ; %bb.0: ; %main_body
; PREGFX10-PACKED-NEXT:    s_load_dword s2, s[0:1], 0x34
; PREGFX10-PACKED-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; PREGFX10-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
; PREGFX10-PACKED-NEXT:    v_mov_b32_e32 v0, s2
; PREGFX10-PACKED-NEXT:    tbuffer_store_format_d16_xy v0, off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED]
; PREGFX10-PACKED-NEXT:    s_endpgm
;
; GFX10-PACKED-LABEL: tbuffer_store_d16_xy:
; GFX10-PACKED:       ; %bb.0: ; %main_body
; GFX10-PACKED-NEXT:    s_clause 0x1
; GFX10-PACKED-NEXT:    s_load_dword s2, s[0:1], 0x34
; GFX10-PACKED-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PACKED-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-PACKED-NEXT:    tbuffer_store_format_d16_xy v0, off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED]
; GFX10-PACKED-NEXT:    s_endpgm
;
; GFX11-PACKED-LABEL: tbuffer_store_d16_xy:
; GFX11-PACKED:       ; %bb.0: ; %main_body
; GFX11-PACKED-NEXT:    s_clause 0x1
; GFX11-PACKED-NEXT:    s_load_b32 s4, s[0:1], 0x34
; GFX11-PACKED-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-PACKED-NEXT:    v_mov_b32_e32 v0, s4
; GFX11-PACKED-NEXT:    tbuffer_store_d16_format_xy v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
; GFX11-PACKED-NEXT:    s_nop 0
; GFX11-PACKED-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PACKED-NEXT:    s_endpgm
main_body:
  call void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0)
  ret void
}
; Three-component case: the kernel receives a <4 x half>, keeps lanes 0..2 via
; shufflevector, and stores the resulting <3 x half> through
; llvm.amdgcn.raw.tbuffer.store.v3f16 with format immediate 33.
; In the expected output, tonga (the PREGFX10-UNPACKED prefix) uses three VGPRs
; (v[0:2]); the other subtargets use two (v[0:1]) and mask the upper half of
; the second dword with 0xffff.
define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %data) {
; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyz:
; PREGFX10-UNPACKED:       ; %bb.0: ; %main_body
; PREGFX10-UNPACKED-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; PREGFX10-UNPACKED-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; PREGFX10-UNPACKED-NEXT:    s_waitcnt lgkmcnt(0)
; PREGFX10-UNPACKED-NEXT:    s_and_b32 s5, s5, 0xffff
; PREGFX10-UNPACKED-NEXT:    s_lshr_b32 s6, s4, 16
; PREGFX10-UNPACKED-NEXT:    s_and_b32 s4, s4, 0xffff
; PREGFX10-UNPACKED-NEXT:    v_mov_b32_e32 v0, s4
; PREGFX10-UNPACKED-NEXT:    v_mov_b32_e32 v1, s6
; PREGFX10-UNPACKED-NEXT:    v_mov_b32_e32 v2, s5
; PREGFX10-UNPACKED-NEXT:    tbuffer_store_format_d16_xyz v[0:2], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED]
; PREGFX10-UNPACKED-NEXT:    s_endpgm
;
; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyz:
; PREGFX10-PACKED:       ; %bb.0: ; %main_body
; PREGFX10-PACKED-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; PREGFX10-PACKED-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; PREGFX10-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
; PREGFX10-PACKED-NEXT:    s_and_b32 s0, s3, 0xffff
; PREGFX10-PACKED-NEXT:    v_mov_b32_e32 v0, s2
; PREGFX10-PACKED-NEXT:    v_mov_b32_e32 v1, s0
; PREGFX10-PACKED-NEXT:    tbuffer_store_format_d16_xyz v[0:1], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED]
; PREGFX10-PACKED-NEXT:    s_endpgm
;
; GFX10-PACKED-LABEL: tbuffer_store_d16_xyz:
; GFX10-PACKED:       ; %bb.0: ; %main_body
; GFX10-PACKED-NEXT:    s_clause 0x1
; GFX10-PACKED-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-PACKED-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PACKED-NEXT:    s_and_b32 s0, s3, 0xffff
; GFX10-PACKED-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-PACKED-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-PACKED-NEXT:    tbuffer_store_format_d16_xyz v[0:1], off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED]
; GFX10-PACKED-NEXT:    s_endpgm
;
; GFX11-PACKED-LABEL: tbuffer_store_d16_xyz:
; GFX11-PACKED:       ; %bb.0: ; %main_body
; GFX11-PACKED-NEXT:    s_clause 0x1
; GFX11-PACKED-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
; GFX11-PACKED-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-PACKED-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX11-PACKED-NEXT:    v_mov_b32_e32 v0, s4
; GFX11-PACKED-NEXT:    v_mov_b32_e32 v1, s5
; GFX11-PACKED-NEXT:    tbuffer_store_d16_format_xyz v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
; GFX11-PACKED-NEXT:    s_nop 0
; GFX11-PACKED-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PACKED-NEXT:    s_endpgm
main_body:
  ; The mask <0, 1, 2> only selects lanes of %data, so the second vector
  ; operand is never read; poison is used as the placeholder instead of undef.
  %data_subvec = shufflevector <4 x half> %data, <4 x half> poison, <3 x i32> <i32 0, i32 1, i32 2>
  call void @llvm.amdgcn.raw.tbuffer.store.v3f16(<3 x half> %data_subvec, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0)
  ret void
}
; Four-component case: stores a <4 x half> through
; llvm.amdgcn.raw.tbuffer.store.v4f16 with format immediate 33.
; In the expected output, tonga (the PREGFX10-UNPACKED prefix) uses one VGPR
; per half (v[0:3]); the other subtargets use two VGPRs (v[0:1]).
define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data) {
; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyzw:
; PREGFX10-UNPACKED:       ; %bb.0: ; %main_body
; PREGFX10-UNPACKED-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; PREGFX10-UNPACKED-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; PREGFX10-UNPACKED-NEXT:    s_waitcnt lgkmcnt(0)
; PREGFX10-UNPACKED-NEXT:    s_lshr_b32 s6, s5, 16
; PREGFX10-UNPACKED-NEXT:    s_and_b32 s5, s5, 0xffff
; PREGFX10-UNPACKED-NEXT:    s_lshr_b32 s7, s4, 16
; PREGFX10-UNPACKED-NEXT:    s_and_b32 s4, s4, 0xffff
; PREGFX10-UNPACKED-NEXT:    v_mov_b32_e32 v0, s4
; PREGFX10-UNPACKED-NEXT:    v_mov_b32_e32 v1, s7
; PREGFX10-UNPACKED-NEXT:    v_mov_b32_e32 v2, s5
; PREGFX10-UNPACKED-NEXT:    v_mov_b32_e32 v3, s6
; PREGFX10-UNPACKED-NEXT:    tbuffer_store_format_d16_xyzw v[0:3], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED]
; PREGFX10-UNPACKED-NEXT:    s_endpgm
;
; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyzw:
; PREGFX10-PACKED:       ; %bb.0: ; %main_body
; PREGFX10-PACKED-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; PREGFX10-PACKED-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; PREGFX10-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
; PREGFX10-PACKED-NEXT:    v_mov_b32_e32 v0, s2
; PREGFX10-PACKED-NEXT:    v_mov_b32_e32 v1, s3
; PREGFX10-PACKED-NEXT:    tbuffer_store_format_d16_xyzw v[0:1], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED]
; PREGFX10-PACKED-NEXT:    s_endpgm
;
; GFX10-PACKED-LABEL: tbuffer_store_d16_xyzw:
; GFX10-PACKED:       ; %bb.0: ; %main_body
; GFX10-PACKED-NEXT:    s_clause 0x1
; GFX10-PACKED-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-PACKED-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PACKED-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-PACKED-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-PACKED-NEXT:    tbuffer_store_format_d16_xyzw v[0:1], off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED]
; GFX10-PACKED-NEXT:    s_endpgm
;
; GFX11-PACKED-LABEL: tbuffer_store_d16_xyzw:
; GFX11-PACKED:       ; %bb.0: ; %main_body
; GFX11-PACKED-NEXT:    s_clause 0x1
; GFX11-PACKED-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
; GFX11-PACKED-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-PACKED-NEXT:    v_mov_b32_e32 v0, s4
; GFX11-PACKED-NEXT:    v_mov_b32_e32 v1, s5
; GFX11-PACKED-NEXT:    tbuffer_store_d16_format_xyzw v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
; GFX11-PACKED-NEXT:    s_nop 0
; GFX11-PACKED-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PACKED-NEXT:    s_endpgm
main_body:
  call void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0)
  ret void
}
; Declarations of the raw tbuffer store intrinsic, one overload per data type
; used by the kernels in this file (half, <2 x half>, <3 x half>, <4 x half>).
; Every overload takes the data value, a <4 x i32> resource, and four i32
; operands.
declare void @llvm.amdgcn.raw.tbuffer.store.f16(half, <4 x i32>, i32, i32, i32, i32)
declare void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32)
declare void @llvm.amdgcn.raw.tbuffer.store.v3f16(<3 x half>, <4 x i32>, i32, i32, i32, i32)
declare void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32)