1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=TONGA %s
3 ; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GFX81 %s
4 ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GFX9 %s
5 ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
6 ; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
; Scalar half result from a 2D sample: calls llvm.amdgcn.image.sample.2d.f16.f32
; with a first operand of 1, which shows up as dmask:0x1 in the expected output.
; Every target is expected to emit one image_sample carrying the d16 modifier and
; writing v0, preceded by an s_wqm / s_and exec sequence. The GFX10PLUS
; expectations use the 32-bit exec_lo forms and add dim:SQ_RSRC_IMG_2D.
8 define amdgpu_ps half @image_sample_2d_f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
9 ; TONGA-LABEL: image_sample_2d_f16:
10 ; TONGA: ; %bb.0: ; %main_body
11 ; TONGA-NEXT: s_mov_b64 s[12:13], exec
12 ; TONGA-NEXT: s_wqm_b64 exec, exec
13 ; TONGA-NEXT: s_and_b64 exec, exec, s[12:13]
14 ; TONGA-NEXT: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 d16
15 ; TONGA-NEXT: s_waitcnt vmcnt(0)
16 ; TONGA-NEXT: ; return to shader part epilog
18 ; GFX81-LABEL: image_sample_2d_f16:
19 ; GFX81: ; %bb.0: ; %main_body
20 ; GFX81-NEXT: s_mov_b64 s[12:13], exec
21 ; GFX81-NEXT: s_wqm_b64 exec, exec
22 ; GFX81-NEXT: s_and_b64 exec, exec, s[12:13]
23 ; GFX81-NEXT: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 d16
24 ; GFX81-NEXT: s_waitcnt vmcnt(0)
25 ; GFX81-NEXT: ; return to shader part epilog
27 ; GFX9-LABEL: image_sample_2d_f16:
28 ; GFX9: ; %bb.0: ; %main_body
29 ; GFX9-NEXT: s_mov_b64 s[12:13], exec
30 ; GFX9-NEXT: s_wqm_b64 exec, exec
31 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13]
32 ; GFX9-NEXT: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 d16
33 ; GFX9-NEXT: s_waitcnt vmcnt(0)
34 ; GFX9-NEXT: ; return to shader part epilog
36 ; GFX10PLUS-LABEL: image_sample_2d_f16:
37 ; GFX10PLUS: ; %bb.0: ; %main_body
38 ; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo
39 ; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo
40 ; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12
41 ; GFX10PLUS-NEXT: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D d16
42 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
43 ; GFX10PLUS-NEXT: ; return to shader part epilog
45 %tex = call half @llvm.amdgcn.image.sample.2d.f16.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; TFE variant of the scalar half 2D sample: the intrinsic returns {half, i32}
; and is called with 1 in the penultimate i32 operand (the non-tfe test passes
; 0); the expected image_sample carries the tfe modifier. The i32 field is
; stored to %out. Expected output zero-initialises the destination registers
; before the sample, then stores v3 with flat_store_dword (TONGA, GFX81),
; global_store_dword (GFX9, GFX10) or global_store_b32 (GFX11).
49 define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, ptr addrspace(1) inreg %out) {
50 ; TONGA-LABEL: image_sample_2d_f16_tfe:
51 ; TONGA: ; %bb.0: ; %main_body
52 ; TONGA-NEXT: s_mov_b64 s[14:15], exec
53 ; TONGA-NEXT: s_wqm_b64 exec, exec
54 ; TONGA-NEXT: v_mov_b32_e32 v2, 0
55 ; TONGA-NEXT: v_mov_b32_e32 v3, v2
56 ; TONGA-NEXT: s_and_b64 exec, exec, s[14:15]
57 ; TONGA-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16
58 ; TONGA-NEXT: v_mov_b32_e32 v0, s12
59 ; TONGA-NEXT: v_mov_b32_e32 v1, s13
60 ; TONGA-NEXT: s_waitcnt vmcnt(0)
61 ; TONGA-NEXT: flat_store_dword v[0:1], v3
62 ; TONGA-NEXT: v_mov_b32_e32 v0, v2
63 ; TONGA-NEXT: s_waitcnt vmcnt(0)
64 ; TONGA-NEXT: ; return to shader part epilog
66 ; GFX81-LABEL: image_sample_2d_f16_tfe:
67 ; GFX81: ; %bb.0: ; %main_body
68 ; GFX81-NEXT: s_mov_b64 s[14:15], exec
69 ; GFX81-NEXT: s_wqm_b64 exec, exec
70 ; GFX81-NEXT: v_mov_b32_e32 v2, 0
71 ; GFX81-NEXT: v_mov_b32_e32 v3, v2
72 ; GFX81-NEXT: s_and_b64 exec, exec, s[14:15]
73 ; GFX81-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16
74 ; GFX81-NEXT: v_mov_b32_e32 v0, s12
75 ; GFX81-NEXT: v_mov_b32_e32 v1, s13
76 ; GFX81-NEXT: s_waitcnt vmcnt(0)
77 ; GFX81-NEXT: flat_store_dword v[0:1], v3
78 ; GFX81-NEXT: v_mov_b32_e32 v0, v2
79 ; GFX81-NEXT: s_waitcnt vmcnt(0)
80 ; GFX81-NEXT: ; return to shader part epilog
82 ; GFX9-LABEL: image_sample_2d_f16_tfe:
83 ; GFX9: ; %bb.0: ; %main_body
84 ; GFX9-NEXT: s_mov_b64 s[14:15], exec
85 ; GFX9-NEXT: s_wqm_b64 exec, exec
86 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
87 ; GFX9-NEXT: v_mov_b32_e32 v5, v4
88 ; GFX9-NEXT: v_mov_b32_e32 v2, v4
89 ; GFX9-NEXT: v_mov_b32_e32 v3, v5
90 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15]
91 ; GFX9-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16
92 ; GFX9-NEXT: s_waitcnt vmcnt(0)
93 ; GFX9-NEXT: v_mov_b32_e32 v0, v2
94 ; GFX9-NEXT: global_store_dword v4, v3, s[12:13]
95 ; GFX9-NEXT: s_waitcnt vmcnt(0)
96 ; GFX9-NEXT: ; return to shader part epilog
98 ; GFX10-LABEL: image_sample_2d_f16_tfe:
99 ; GFX10: ; %bb.0: ; %main_body
100 ; GFX10-NEXT: s_mov_b32 s14, exec_lo
101 ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
102 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
103 ; GFX10-NEXT: v_mov_b32_e32 v5, v4
104 ; GFX10-NEXT: v_mov_b32_e32 v2, v4
105 ; GFX10-NEXT: v_mov_b32_e32 v3, v5
106 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14
107 ; GFX10-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16
108 ; GFX10-NEXT: s_waitcnt vmcnt(0)
109 ; GFX10-NEXT: v_mov_b32_e32 v0, v2
110 ; GFX10-NEXT: global_store_dword v4, v3, s[12:13]
111 ; GFX10-NEXT: ; return to shader part epilog
113 ; GFX11-LABEL: image_sample_2d_f16_tfe:
114 ; GFX11: ; %bb.0: ; %main_body
115 ; GFX11-NEXT: s_mov_b32 s14, exec_lo
116 ; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
117 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
118 ; GFX11-NEXT: v_mov_b32_e32 v5, v4
119 ; GFX11-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5
120 ; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s14
121 ; GFX11-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16
122 ; GFX11-NEXT: s_waitcnt vmcnt(0)
123 ; GFX11-NEXT: v_mov_b32_e32 v0, v2
124 ; GFX11-NEXT: global_store_b32 v4, v3, s[12:13]
125 ; GFX11-NEXT: ; return to shader part epilog
127 %tex = call {half,i32} @llvm.amdgcn.image.sample.2d.f16i32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
128 %tex.vec = extractvalue {half, i32} %tex, 0
129 %tex.err = extractvalue {half, i32} %tex, 1
130 store i32 %tex.err, ptr addrspace(1) %out, align 4
; <2 x half> result from a 1D sample_c_d: the intrinsic is called with a first
; operand of 3 (dmask:0x3 in the expected output) and its result is bitcast to
; float. On TONGA the expected sample writes two registers, v[0:1], which are
; then combined with v_lshlrev_b32 + v_or_b32_sdwa; on the other targets the
; expected sample writes the single register v0 with no follow-up packing.
; No s_wqm sequence is expected for this test.
134 define amdgpu_ps float @image_sample_c_d_1d_v2f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) {
135 ; TONGA-LABEL: image_sample_c_d_1d_v2f16:
136 ; TONGA: ; %bb.0: ; %main_body
137 ; TONGA-NEXT: image_sample_c_d v[0:1], v[0:3], s[0:7], s[8:11] dmask:0x3 d16
138 ; TONGA-NEXT: s_waitcnt vmcnt(0)
139 ; TONGA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
140 ; TONGA-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
141 ; TONGA-NEXT: ; return to shader part epilog
143 ; GFX81-LABEL: image_sample_c_d_1d_v2f16:
144 ; GFX81: ; %bb.0: ; %main_body
145 ; GFX81-NEXT: image_sample_c_d v0, v[0:3], s[0:7], s[8:11] dmask:0x3 d16
146 ; GFX81-NEXT: s_waitcnt vmcnt(0)
147 ; GFX81-NEXT: ; return to shader part epilog
149 ; GFX9-LABEL: image_sample_c_d_1d_v2f16:
150 ; GFX9: ; %bb.0: ; %main_body
151 ; GFX9-NEXT: image_sample_c_d v0, v[0:3], s[0:7], s[8:11] dmask:0x3 d16
152 ; GFX9-NEXT: s_waitcnt vmcnt(0)
153 ; GFX9-NEXT: ; return to shader part epilog
155 ; GFX10PLUS-LABEL: image_sample_c_d_1d_v2f16:
156 ; GFX10PLUS: ; %bb.0: ; %main_body
157 ; GFX10PLUS-NEXT: image_sample_c_d v0, v[0:3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D d16
158 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
159 ; GFX10PLUS-NEXT: ; return to shader part epilog
161 %tex = call <2 x half> @llvm.amdgcn.image.sample.c.d.1d.v2f16.f32.f32(i32 3, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
162 %r = bitcast <2 x half> %tex to float
; TFE variant of the <2 x half> 1D sample_c_d: the intrinsic returns
; {<2 x half>, i32}; the vector is bitcast to float and placed in element 0 of
; %r, the i32 is bitcast to float and placed in element 1. Expected output
; zero-initialises the destination registers before the sample. On TONGA the
; expected destination is three registers, v[4:6], with the first two combined
; via v_lshlrev_b32 + v_or_b32_sdwa; on the other targets it is two registers.
; The GFX10 and GFX11 expectations use the bracketed address list
; [v5, v4, v2, v3] and write the result directly into v[0:1].
166 define amdgpu_ps <2 x float> @image_sample_c_d_1d_v2f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) {
167 ; TONGA-LABEL: image_sample_c_d_1d_v2f16_tfe:
168 ; TONGA: ; %bb.0: ; %main_body
169 ; TONGA-NEXT: v_mov_b32_e32 v4, 0
170 ; TONGA-NEXT: v_mov_b32_e32 v5, v4
171 ; TONGA-NEXT: v_mov_b32_e32 v6, v4
172 ; TONGA-NEXT: image_sample_c_d v[4:6], v[0:3], s[0:7], s[8:11] dmask:0x3 tfe d16
173 ; TONGA-NEXT: s_waitcnt vmcnt(0)
174 ; TONGA-NEXT: v_lshlrev_b32_e32 v0, 16, v5
175 ; TONGA-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
176 ; TONGA-NEXT: v_mov_b32_e32 v1, v6
177 ; TONGA-NEXT: ; return to shader part epilog
179 ; GFX81-LABEL: image_sample_c_d_1d_v2f16_tfe:
180 ; GFX81: ; %bb.0: ; %main_body
181 ; GFX81-NEXT: v_mov_b32_e32 v4, 0
182 ; GFX81-NEXT: v_mov_b32_e32 v5, v4
183 ; GFX81-NEXT: image_sample_c_d v[4:5], v[0:3], s[0:7], s[8:11] dmask:0x3 tfe d16
184 ; GFX81-NEXT: s_waitcnt vmcnt(0)
185 ; GFX81-NEXT: v_mov_b32_e32 v0, v4
186 ; GFX81-NEXT: v_mov_b32_e32 v1, v5
187 ; GFX81-NEXT: ; return to shader part epilog
189 ; GFX9-LABEL: image_sample_c_d_1d_v2f16_tfe:
190 ; GFX9: ; %bb.0: ; %main_body
191 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
192 ; GFX9-NEXT: v_mov_b32_e32 v5, v4
193 ; GFX9-NEXT: image_sample_c_d v[4:5], v[0:3], s[0:7], s[8:11] dmask:0x3 tfe d16
194 ; GFX9-NEXT: s_waitcnt vmcnt(0)
195 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
196 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
197 ; GFX9-NEXT: ; return to shader part epilog
199 ; GFX10-LABEL: image_sample_c_d_1d_v2f16_tfe:
200 ; GFX10: ; %bb.0: ; %main_body
201 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
202 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
203 ; GFX10-NEXT: v_mov_b32_e32 v4, v1
204 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
205 ; GFX10-NEXT: image_sample_c_d v[0:1], [v5, v4, v2, v3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe d16
206 ; GFX10-NEXT: s_waitcnt vmcnt(0)
207 ; GFX10-NEXT: ; return to shader part epilog
209 ; GFX11-LABEL: image_sample_c_d_1d_v2f16_tfe:
210 ; GFX11: ; %bb.0: ; %main_body
211 ; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
212 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
213 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
214 ; GFX11-NEXT: image_sample_c_d v[0:1], [v5, v4, v2, v3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe d16
215 ; GFX11-NEXT: s_waitcnt vmcnt(0)
216 ; GFX11-NEXT: ; return to shader part epilog
218 %tex = call {<2 x half>,i32} @llvm.amdgcn.image.sample.c.d.1d.v2f16i32.f32.f32(i32 3, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
219 %tex.vec = extractvalue {<2 x half>, i32} %tex, 0
220 %tex.err = extractvalue {<2 x half>, i32} %tex, 1
221 %tex.vecf = bitcast <2 x half> %tex.vec to float
222 %r.0 = insertelement <2 x float> undef, float %tex.vecf, i32 0
223 %tex.errf = bitcast i32 %tex.err to float
224 %r = insertelement <2 x float> %r.0, float %tex.errf, i32 1
; <3 x half> result from a 2D sample_b: the intrinsic is called with a first
; operand of 7 (dmask:0x7 in the expected output). The result is widened to
; <4 x half> with a shufflevector against undef and bitcast to <2 x float>.
; On TONGA the expected sample writes three registers, v[0:2], and the first
; two are merged with v_perm_b32 using the selector 0x1000504; on the other
; targets the expected sample writes two registers, v[0:1], with no follow-up.
228 define amdgpu_ps <2 x float> @image_sample_b_2d_v3f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
229 ; TONGA-LABEL: image_sample_b_2d_v3f16:
230 ; TONGA: ; %bb.0: ; %main_body
231 ; TONGA-NEXT: s_mov_b64 s[12:13], exec
232 ; TONGA-NEXT: s_wqm_b64 exec, exec
233 ; TONGA-NEXT: s_and_b64 exec, exec, s[12:13]
234 ; TONGA-NEXT: image_sample_b v[0:2], v[0:2], s[0:7], s[8:11] dmask:0x7 d16
235 ; TONGA-NEXT: s_mov_b32 s0, 0x1000504
236 ; TONGA-NEXT: s_waitcnt vmcnt(0)
237 ; TONGA-NEXT: v_perm_b32 v0, v0, v1, s0
238 ; TONGA-NEXT: v_mov_b32_e32 v1, v2
239 ; TONGA-NEXT: ; return to shader part epilog
241 ; GFX81-LABEL: image_sample_b_2d_v3f16:
242 ; GFX81: ; %bb.0: ; %main_body
243 ; GFX81-NEXT: s_mov_b64 s[12:13], exec
244 ; GFX81-NEXT: s_wqm_b64 exec, exec
245 ; GFX81-NEXT: s_and_b64 exec, exec, s[12:13]
246 ; GFX81-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x7 d16
247 ; GFX81-NEXT: s_waitcnt vmcnt(0)
248 ; GFX81-NEXT: ; return to shader part epilog
250 ; GFX9-LABEL: image_sample_b_2d_v3f16:
251 ; GFX9: ; %bb.0: ; %main_body
252 ; GFX9-NEXT: s_mov_b64 s[12:13], exec
253 ; GFX9-NEXT: s_wqm_b64 exec, exec
254 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13]
255 ; GFX9-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x7 d16
256 ; GFX9-NEXT: s_waitcnt vmcnt(0)
257 ; GFX9-NEXT: ; return to shader part epilog
259 ; GFX10PLUS-LABEL: image_sample_b_2d_v3f16:
260 ; GFX10PLUS: ; %bb.0: ; %main_body
261 ; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo
262 ; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo
263 ; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12
264 ; GFX10PLUS-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D d16
265 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
266 ; GFX10PLUS-NEXT: ; return to shader part epilog
268 %tex = call <3 x half> @llvm.amdgcn.image.sample.b.2d.v3f16.f32.f32(i32 7, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
269 %tex_wide = shufflevector <3 x half> %tex, <3 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
270 %r = bitcast <4 x half> %tex_wide to <2 x float>
; TFE variant of the <3 x half> 2D sample_b: the intrinsic returns
; {<3 x half>, i32}. The vector is widened to <4 x half>, bitcast to
; <2 x float> and placed in elements 0 and 1 of %r; the i32 is bitcast to float
; and placed in element 2. Element 3 of %r is never written by the visible
; lines, so it keeps the initial undef. Expected output zero-initialises the
; destination registers before the sample: four registers, v[3:6], on TONGA
; and three registers on the other targets.
274 define amdgpu_ps <4 x float> @image_sample_b_2d_v3f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
275 ; TONGA-LABEL: image_sample_b_2d_v3f16_tfe:
276 ; TONGA: ; %bb.0: ; %main_body
277 ; TONGA-NEXT: s_mov_b64 s[12:13], exec
278 ; TONGA-NEXT: s_wqm_b64 exec, exec
279 ; TONGA-NEXT: v_mov_b32_e32 v3, 0
280 ; TONGA-NEXT: v_mov_b32_e32 v4, v3
281 ; TONGA-NEXT: v_mov_b32_e32 v5, v3
282 ; TONGA-NEXT: v_mov_b32_e32 v6, v3
283 ; TONGA-NEXT: s_and_b64 exec, exec, s[12:13]
284 ; TONGA-NEXT: image_sample_b v[3:6], v[0:2], s[0:7], s[8:11] dmask:0x7 tfe d16
285 ; TONGA-NEXT: s_mov_b32 s0, 0x1000504
286 ; TONGA-NEXT: s_waitcnt vmcnt(0)
287 ; TONGA-NEXT: v_perm_b32 v0, v3, v4, s0
288 ; TONGA-NEXT: v_mov_b32_e32 v1, v5
289 ; TONGA-NEXT: v_mov_b32_e32 v2, v6
290 ; TONGA-NEXT: ; return to shader part epilog
292 ; GFX81-LABEL: image_sample_b_2d_v3f16_tfe:
293 ; GFX81: ; %bb.0: ; %main_body
294 ; GFX81-NEXT: s_mov_b64 s[12:13], exec
295 ; GFX81-NEXT: s_wqm_b64 exec, exec
296 ; GFX81-NEXT: v_mov_b32_e32 v3, 0
297 ; GFX81-NEXT: v_mov_b32_e32 v4, v3
298 ; GFX81-NEXT: v_mov_b32_e32 v5, v3
299 ; GFX81-NEXT: s_and_b64 exec, exec, s[12:13]
300 ; GFX81-NEXT: image_sample_b v[3:5], v[0:2], s[0:7], s[8:11] dmask:0x7 tfe d16
301 ; GFX81-NEXT: s_waitcnt vmcnt(0)
302 ; GFX81-NEXT: v_mov_b32_e32 v0, v3
303 ; GFX81-NEXT: v_mov_b32_e32 v1, v4
304 ; GFX81-NEXT: v_mov_b32_e32 v2, v5
305 ; GFX81-NEXT: ; return to shader part epilog
307 ; GFX9-LABEL: image_sample_b_2d_v3f16_tfe:
308 ; GFX9: ; %bb.0: ; %main_body
309 ; GFX9-NEXT: s_mov_b64 s[12:13], exec
310 ; GFX9-NEXT: s_wqm_b64 exec, exec
311 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
312 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
313 ; GFX9-NEXT: v_mov_b32_e32 v5, v3
314 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13]
315 ; GFX9-NEXT: image_sample_b v[3:5], v[0:2], s[0:7], s[8:11] dmask:0x7 tfe d16
316 ; GFX9-NEXT: s_waitcnt vmcnt(0)
317 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
318 ; GFX9-NEXT: v_mov_b32_e32 v1, v4
319 ; GFX9-NEXT: v_mov_b32_e32 v2, v5
320 ; GFX9-NEXT: ; return to shader part epilog
322 ; GFX10-LABEL: image_sample_b_2d_v3f16_tfe:
323 ; GFX10: ; %bb.0: ; %main_body
324 ; GFX10-NEXT: s_mov_b32 s12, exec_lo
325 ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
326 ; GFX10-NEXT: v_mov_b32_e32 v3, v0
327 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
328 ; GFX10-NEXT: v_mov_b32_e32 v5, v2
329 ; GFX10-NEXT: v_mov_b32_e32 v4, v1
330 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
331 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
332 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
333 ; GFX10-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D tfe d16
334 ; GFX10-NEXT: s_waitcnt vmcnt(0)
335 ; GFX10-NEXT: ; return to shader part epilog
337 ; GFX11-LABEL: image_sample_b_2d_v3f16_tfe:
338 ; GFX11: ; %bb.0: ; %main_body
339 ; GFX11-NEXT: s_mov_b32 s12, exec_lo
340 ; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
341 ; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v0, 0
342 ; GFX11-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
343 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
344 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
345 ; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s12
346 ; GFX11-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D tfe d16
347 ; GFX11-NEXT: s_waitcnt vmcnt(0)
348 ; GFX11-NEXT: ; return to shader part epilog
350 %tex = call {<3 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v3f16i32.f32.f32(i32 7, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
351 %tex.vec = extractvalue {<3 x half>, i32} %tex, 0
352 %tex.vec_wide = shufflevector <3 x half> %tex.vec, <3 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
353 %tex.err = extractvalue {<3 x half>, i32} %tex, 1
354 %tex.vecf = bitcast <4 x half> %tex.vec_wide to <2 x float>
355 %tex.vecf.0 = extractelement <2 x float> %tex.vecf, i32 0
356 %tex.vecf.1 = extractelement <2 x float> %tex.vecf, i32 1
357 %r.0 = insertelement <4 x float> undef, float %tex.vecf.0, i32 0
358 %r.1 = insertelement <4 x float> %r.0, float %tex.vecf.1, i32 1
359 %tex.errf = bitcast i32 %tex.err to float
360 %r = insertelement <4 x float> %r.1, float %tex.errf, i32 2
; <4 x half> result from a 2D sample_b: the intrinsic is called with a first
; operand of 15 (dmask:0xf in the expected output) and its result is bitcast
; to <2 x float>. On TONGA the expected sample writes four registers, v[0:3],
; which are merged pairwise by two v_perm_b32 instructions using the selector
; 0x1000504; on the other targets the expected sample writes two registers,
; v[0:1], with no follow-up.
364 define amdgpu_ps <2 x float> @image_sample_b_2d_v4f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
365 ; TONGA-LABEL: image_sample_b_2d_v4f16:
366 ; TONGA: ; %bb.0: ; %main_body
367 ; TONGA-NEXT: s_mov_b64 s[12:13], exec
368 ; TONGA-NEXT: s_wqm_b64 exec, exec
369 ; TONGA-NEXT: s_and_b64 exec, exec, s[12:13]
370 ; TONGA-NEXT: image_sample_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf d16
371 ; TONGA-NEXT: s_mov_b32 s0, 0x1000504
372 ; TONGA-NEXT: s_waitcnt vmcnt(0)
373 ; TONGA-NEXT: v_perm_b32 v0, v0, v1, s0
374 ; TONGA-NEXT: v_perm_b32 v1, v2, v3, s0
375 ; TONGA-NEXT: ; return to shader part epilog
377 ; GFX81-LABEL: image_sample_b_2d_v4f16:
378 ; GFX81: ; %bb.0: ; %main_body
379 ; GFX81-NEXT: s_mov_b64 s[12:13], exec
380 ; GFX81-NEXT: s_wqm_b64 exec, exec
381 ; GFX81-NEXT: s_and_b64 exec, exec, s[12:13]
382 ; GFX81-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0xf d16
383 ; GFX81-NEXT: s_waitcnt vmcnt(0)
384 ; GFX81-NEXT: ; return to shader part epilog
386 ; GFX9-LABEL: image_sample_b_2d_v4f16:
387 ; GFX9: ; %bb.0: ; %main_body
388 ; GFX9-NEXT: s_mov_b64 s[12:13], exec
389 ; GFX9-NEXT: s_wqm_b64 exec, exec
390 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13]
391 ; GFX9-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0xf d16
392 ; GFX9-NEXT: s_waitcnt vmcnt(0)
393 ; GFX9-NEXT: ; return to shader part epilog
395 ; GFX10PLUS-LABEL: image_sample_b_2d_v4f16:
396 ; GFX10PLUS: ; %bb.0: ; %main_body
397 ; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo
398 ; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo
399 ; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12
400 ; GFX10PLUS-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D d16
401 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
402 ; GFX10PLUS-NEXT: ; return to shader part epilog
404 %tex = call <4 x half> @llvm.amdgcn.image.sample.b.2d.v4f16.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
405 %r = bitcast <4 x half> %tex to <2 x float>
; TFE variant of the <4 x half> 2D sample_b: the intrinsic returns
; {<4 x half>, i32}. The vector is bitcast to <2 x float> and placed in
; elements 0 and 1 of %r; the i32 is bitcast to float and placed in element 2.
; Element 3 of %r is never written by the visible lines, so it keeps the
; initial undef. Expected output zero-initialises the destination registers
; before the sample: five registers, v[3:7], on TONGA (followed by two
; v_perm_b32 merges) and three registers on the other targets.
409 define amdgpu_ps <4 x float> @image_sample_b_2d_v4f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
410 ; TONGA-LABEL: image_sample_b_2d_v4f16_tfe:
411 ; TONGA: ; %bb.0: ; %main_body
412 ; TONGA-NEXT: s_mov_b64 s[12:13], exec
413 ; TONGA-NEXT: s_wqm_b64 exec, exec
414 ; TONGA-NEXT: v_mov_b32_e32 v3, 0
415 ; TONGA-NEXT: v_mov_b32_e32 v4, v3
416 ; TONGA-NEXT: v_mov_b32_e32 v5, v3
417 ; TONGA-NEXT: v_mov_b32_e32 v6, v3
418 ; TONGA-NEXT: v_mov_b32_e32 v7, v3
419 ; TONGA-NEXT: s_and_b64 exec, exec, s[12:13]
420 ; TONGA-NEXT: image_sample_b v[3:7], v[0:2], s[0:7], s[8:11] dmask:0xf tfe d16
421 ; TONGA-NEXT: s_mov_b32 s0, 0x1000504
422 ; TONGA-NEXT: s_waitcnt vmcnt(0)
423 ; TONGA-NEXT: v_perm_b32 v0, v3, v4, s0
424 ; TONGA-NEXT: v_perm_b32 v1, v5, v6, s0
425 ; TONGA-NEXT: v_mov_b32_e32 v2, v7
426 ; TONGA-NEXT: ; return to shader part epilog
428 ; GFX81-LABEL: image_sample_b_2d_v4f16_tfe:
429 ; GFX81: ; %bb.0: ; %main_body
430 ; GFX81-NEXT: s_mov_b64 s[12:13], exec
431 ; GFX81-NEXT: s_wqm_b64 exec, exec
432 ; GFX81-NEXT: v_mov_b32_e32 v3, 0
433 ; GFX81-NEXT: v_mov_b32_e32 v4, v3
434 ; GFX81-NEXT: v_mov_b32_e32 v5, v3
435 ; GFX81-NEXT: s_and_b64 exec, exec, s[12:13]
436 ; GFX81-NEXT: image_sample_b v[3:5], v[0:2], s[0:7], s[8:11] dmask:0xf tfe d16
437 ; GFX81-NEXT: s_waitcnt vmcnt(0)
438 ; GFX81-NEXT: v_mov_b32_e32 v0, v3
439 ; GFX81-NEXT: v_mov_b32_e32 v1, v4
440 ; GFX81-NEXT: v_mov_b32_e32 v2, v5
441 ; GFX81-NEXT: ; return to shader part epilog
443 ; GFX9-LABEL: image_sample_b_2d_v4f16_tfe:
444 ; GFX9: ; %bb.0: ; %main_body
445 ; GFX9-NEXT: s_mov_b64 s[12:13], exec
446 ; GFX9-NEXT: s_wqm_b64 exec, exec
447 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
448 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
449 ; GFX9-NEXT: v_mov_b32_e32 v5, v3
450 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13]
451 ; GFX9-NEXT: image_sample_b v[3:5], v[0:2], s[0:7], s[8:11] dmask:0xf tfe d16
452 ; GFX9-NEXT: s_waitcnt vmcnt(0)
453 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
454 ; GFX9-NEXT: v_mov_b32_e32 v1, v4
455 ; GFX9-NEXT: v_mov_b32_e32 v2, v5
456 ; GFX9-NEXT: ; return to shader part epilog
458 ; GFX10-LABEL: image_sample_b_2d_v4f16_tfe:
459 ; GFX10: ; %bb.0: ; %main_body
460 ; GFX10-NEXT: s_mov_b32 s12, exec_lo
461 ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
462 ; GFX10-NEXT: v_mov_b32_e32 v3, v0
463 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
464 ; GFX10-NEXT: v_mov_b32_e32 v5, v2
465 ; GFX10-NEXT: v_mov_b32_e32 v4, v1
466 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
467 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
468 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
469 ; GFX10-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D tfe d16
470 ; GFX10-NEXT: s_waitcnt vmcnt(0)
471 ; GFX10-NEXT: ; return to shader part epilog
473 ; GFX11-LABEL: image_sample_b_2d_v4f16_tfe:
474 ; GFX11: ; %bb.0: ; %main_body
475 ; GFX11-NEXT: s_mov_b32 s12, exec_lo
476 ; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
477 ; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v0, 0
478 ; GFX11-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
479 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
480 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
481 ; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s12
482 ; GFX11-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D tfe d16
483 ; GFX11-NEXT: s_waitcnt vmcnt(0)
484 ; GFX11-NEXT: ; return to shader part epilog
486 %tex = call {<4 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v4f16i32.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
487 %tex.vec = extractvalue {<4 x half>, i32} %tex, 0
488 %tex.err = extractvalue {<4 x half>, i32} %tex, 1
489 %tex.vecf = bitcast <4 x half> %tex.vec to <2 x float>
490 %tex.vecf.0 = extractelement <2 x float> %tex.vecf, i32 0
491 %tex.vecf.1 = extractelement <2 x float> %tex.vecf, i32 1
492 %r.0 = insertelement <4 x float> undef, float %tex.vecf.0, i32 0
493 %r.1 = insertelement <4 x float> %r.0, float %tex.vecf.1, i32 1
494 %tex.errf = bitcast i32 %tex.err to float
495 %r = insertelement <4 x float> %r.1, float %tex.errf, i32 2
; Declarations of the image-sample intrinsics; every one is tagged with
; attribute group #1. The variants returning a {value, i32} struct are the ones
; called by the *_tfe tests.
; NOTE(review): sample.2d.v3f16.f32, sample.2d.v4f16.f32 and
; sample.2d.v2f16i32.f32 have no call site among the lines visible here;
; confirm whether they are still needed.
499 declare half @llvm.amdgcn.image.sample.2d.f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
500 declare {half,i32} @llvm.amdgcn.image.sample.2d.f16i32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
501 declare <3 x half> @llvm.amdgcn.image.sample.2d.v3f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
502 declare <4 x half> @llvm.amdgcn.image.sample.2d.v4f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
503 declare {<2 x half>,i32} @llvm.amdgcn.image.sample.2d.v2f16i32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
504 declare <2 x half> @llvm.amdgcn.image.sample.c.d.1d.v2f16.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
505 declare {<2 x half>,i32} @llvm.amdgcn.image.sample.c.d.1d.v2f16i32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
506 declare <3 x half> @llvm.amdgcn.image.sample.b.2d.v3f16.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
507 declare {<3 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v3f16i32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
508 declare <4 x half> @llvm.amdgcn.image.sample.b.2d.v4f16.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
509 declare {<4 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v4f16i32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
; Attribute groups.
; NOTE(review): only #1 is referenced by the lines visible here; #0 and #2
; appear unused, so confirm before removing them.
511 attributes #0 = { nounwind }
512 attributes #1 = { nounwind readonly }
513 attributes #2 = { nounwind readnone }