1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
; v_interp_f32: basic f32 interpolation test.
; Loads two parameters with llvm.amdgcn.lds.param.load (constant operands
; (1, 0) and (0, 1), which the checks render as attr0.y and attr1.x), feeds
; each through interp.inreg.p10, then through interp.inreg.p2, and exports the
; four resulting values with llvm.amdgcn.exp.f32.
; The second p2 call takes %p1_0 (the result of the first p2) as its third
; operand, so the two interpolations are chained; in the checks this is the
; final v_interp_p2_f32 reading v5.
; The expected output differs between the GFX11 and GFX12 runs only in the
; parameter-load mnemonic and its wait operands (lds_param_load vs
; ds_param_load) and in the export mnemonic (exp vs export).
6 define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
7 ; GFX11-LABEL: v_interp_f32:
8 ; GFX11: ; %bb.0: ; %main_body
9 ; GFX11-NEXT: s_mov_b32 s3, exec_lo
10 ; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
11 ; GFX11-NEXT: s_mov_b32 m0, s2
12 ; GFX11-NEXT: lds_param_load v0, attr0.y wait_vdst:15
13 ; GFX11-NEXT: lds_param_load v1, attr1.x wait_vdst:15
14 ; GFX11-NEXT: s_mov_b32 exec_lo, s3
15 ; GFX11-NEXT: v_mov_b32_e32 v2, s0
16 ; GFX11-NEXT: v_mov_b32_e32 v4, s1
17 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
18 ; GFX11-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1
19 ; GFX11-NEXT: v_interp_p10_f32 v2, v1, v2, v1 wait_exp:0
20 ; GFX11-NEXT: v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7
21 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
22 ; GFX11-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7
23 ; GFX11-NEXT: exp mrt0 v3, v2, v5, v4 done
24 ; GFX11-NEXT: s_endpgm
26 ; GFX12-LABEL: v_interp_f32:
27 ; GFX12: ; %bb.0: ; %main_body
28 ; GFX12-NEXT: s_mov_b32 s3, exec_lo
29 ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo
30 ; GFX12-NEXT: s_mov_b32 m0, s2
31 ; GFX12-NEXT: ds_param_load v0, attr0.y wait_va_vdst:15 wait_vm_vsrc:1
32 ; GFX12-NEXT: ds_param_load v1, attr1.x wait_va_vdst:15 wait_vm_vsrc:1
33 ; GFX12-NEXT: s_mov_b32 exec_lo, s3
34 ; GFX12-NEXT: v_mov_b32_e32 v2, s0
35 ; GFX12-NEXT: v_mov_b32_e32 v4, s1
36 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
37 ; GFX12-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1
38 ; GFX12-NEXT: v_interp_p10_f32 v2, v1, v2, v1 wait_exp:0
39 ; GFX12-NEXT: v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7
40 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
41 ; GFX12-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7
42 ; GFX12-NEXT: export mrt0 v3, v2, v5, v4 done
43 ; GFX12-NEXT: s_endpgm
; IR under test: %m0 is passed as the last operand of both parameter loads.
45 %p0 = call float @llvm.amdgcn.lds.param.load(i32 1, i32 0, i32 %m0)
46 %p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0)
47 %p0_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p0, float %i, float %p0)
48 %p1_0 = call float @llvm.amdgcn.interp.inreg.p2(float %p0, float %j, float %p0_0)
49 %p0_1 = call float @llvm.amdgcn.interp.inreg.p10(float %p1, float %i, float %p1)
50 %p1_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p1, float %j, float %p1_0)
51 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %p0_0, float %p0_1, float %p1_0, float %p1_1, i1 true, i1 true) #0
; v_interp_f32_many: four independent f32 interpolations.
; Loads four parameters (second operand 0..3, rendered as attr0.x..attr3.x in
; the checks) and runs a separate interp.inreg.p10 -> interp.inreg.p2 pair on
; each one, always with %i for p10 and %j for p2. Only the four p2 results are
; exported.
; The checks expect the four p10 instructions to carry descending wait_exp
; values (3, 2, 1, 0) and every p2 instruction to carry wait_exp:7.
55 define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
56 ; GFX11-LABEL: v_interp_f32_many:
57 ; GFX11: ; %bb.0: ; %main_body
58 ; GFX11-NEXT: s_mov_b32 s3, exec_lo
59 ; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
60 ; GFX11-NEXT: s_mov_b32 m0, s2
61 ; GFX11-NEXT: lds_param_load v0, attr0.x wait_vdst:15
62 ; GFX11-NEXT: lds_param_load v1, attr1.x wait_vdst:15
63 ; GFX11-NEXT: lds_param_load v2, attr2.x wait_vdst:15
64 ; GFX11-NEXT: lds_param_load v3, attr3.x wait_vdst:15
65 ; GFX11-NEXT: s_mov_b32 exec_lo, s3
66 ; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
67 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
68 ; GFX11-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3
69 ; GFX11-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2
70 ; GFX11-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1
71 ; GFX11-NEXT: v_interp_p10_f32 v4, v3, v4, v3 wait_exp:0
72 ; GFX11-NEXT: v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7
73 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
74 ; GFX11-NEXT: v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7
75 ; GFX11-NEXT: v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7
76 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
77 ; GFX11-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7
78 ; GFX11-NEXT: exp mrt0 v6, v7, v8, v4 done
79 ; GFX11-NEXT: s_endpgm
81 ; GFX12-LABEL: v_interp_f32_many:
82 ; GFX12: ; %bb.0: ; %main_body
83 ; GFX12-NEXT: s_mov_b32 s3, exec_lo
84 ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo
85 ; GFX12-NEXT: s_mov_b32 m0, s2
86 ; GFX12-NEXT: ds_param_load v0, attr0.x wait_va_vdst:15 wait_vm_vsrc:1
87 ; GFX12-NEXT: ds_param_load v1, attr1.x wait_va_vdst:15 wait_vm_vsrc:1
88 ; GFX12-NEXT: ds_param_load v2, attr2.x wait_va_vdst:15 wait_vm_vsrc:1
89 ; GFX12-NEXT: ds_param_load v3, attr3.x wait_va_vdst:15 wait_vm_vsrc:1
90 ; GFX12-NEXT: s_mov_b32 exec_lo, s3
91 ; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
92 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
93 ; GFX12-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3
94 ; GFX12-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2
95 ; GFX12-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1
96 ; GFX12-NEXT: v_interp_p10_f32 v4, v3, v4, v3 wait_exp:0
97 ; GFX12-NEXT: v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7
98 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
99 ; GFX12-NEXT: v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7
100 ; GFX12-NEXT: v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7
101 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
102 ; GFX12-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7
103 ; GFX12-NEXT: export mrt0 v6, v7, v8, v4 done
104 ; GFX12-NEXT: s_endpgm
; IR under test: the IR lists each p10 immediately before its p2, while the
; expected machine code above groups all p10 instructions ahead of the p2s.
106 %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
107 %p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0)
108 %p2 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 2, i32 %m0)
109 %p3 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 3, i32 %m0)
110 %p0_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p0, float %i, float %p0)
111 %p0_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p0, float %j, float %p0_0)
112 %p1_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p1, float %i, float %p1)
113 %p1_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p1, float %j, float %p1_0)
114 %p2_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p2, float %i, float %p2)
115 %p2_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p2, float %j, float %p2_0)
116 %p3_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p3, float %i, float %p3)
117 %p3_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p3, float %j, float %p3_0)
118 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %p0_1, float %p1_1, float %p2_1, float %p3_1, i1 true, i1 true) #0
; v_interp_f32_many_vm: same four p10 -> p2 interpolations as
; v_interp_f32_many, but %i and %j are loaded from global memory
; (ptr addrspace(1), float elements 1 and 2 of %ptr) instead of being passed
; as inreg arguments.
; The checks expect the two float loads to be emitted as one global_load_b64
; at offset:4, issued before the parameter loads, and a wait for that load
; (s_waitcnt vmcnt(0) on GFX11, s_wait_loadcnt 0x0 on GFX12) immediately
; before the first v_interp_p10_f32.
122 define amdgpu_ps void @v_interp_f32_many_vm(ptr addrspace(1) %ptr, i32 inreg %m0) #0 {
123 ; GFX11-LABEL: v_interp_f32_many_vm:
124 ; GFX11: ; %bb.0: ; %main_body
125 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4
126 ; GFX11-NEXT: s_mov_b32 m0, s0
127 ; GFX11-NEXT: s_mov_b32 s0, exec_lo
128 ; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
129 ; GFX11-NEXT: lds_param_load v2, attr0.x wait_vdst:15
130 ; GFX11-NEXT: lds_param_load v3, attr1.x wait_vdst:15
131 ; GFX11-NEXT: lds_param_load v4, attr2.x wait_vdst:15
132 ; GFX11-NEXT: lds_param_load v5, attr3.x wait_vdst:15
133 ; GFX11-NEXT: s_mov_b32 exec_lo, s0
134 ; GFX11-NEXT: s_waitcnt vmcnt(0)
135 ; GFX11-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:3
136 ; GFX11-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2
137 ; GFX11-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1
138 ; GFX11-NEXT: v_interp_p10_f32 v0, v5, v0, v5 wait_exp:0
139 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
140 ; GFX11-NEXT: v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7
141 ; GFX11-NEXT: v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7
142 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
143 ; GFX11-NEXT: v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7
144 ; GFX11-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7
145 ; GFX11-NEXT: exp mrt0 v6, v7, v8, v0 done
146 ; GFX11-NEXT: s_endpgm
148 ; GFX12-LABEL: v_interp_f32_many_vm:
149 ; GFX12: ; %bb.0: ; %main_body
150 ; GFX12-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4
151 ; GFX12-NEXT: s_mov_b32 m0, s0
152 ; GFX12-NEXT: s_mov_b32 s0, exec_lo
153 ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo
154 ; GFX12-NEXT: ds_param_load v2, attr0.x wait_va_vdst:15 wait_vm_vsrc:1
155 ; GFX12-NEXT: ds_param_load v3, attr1.x wait_va_vdst:15 wait_vm_vsrc:1
156 ; GFX12-NEXT: ds_param_load v4, attr2.x wait_va_vdst:15 wait_vm_vsrc:1
157 ; GFX12-NEXT: ds_param_load v5, attr3.x wait_va_vdst:15 wait_vm_vsrc:1
158 ; GFX12-NEXT: s_mov_b32 exec_lo, s0
159 ; GFX12-NEXT: s_wait_loadcnt 0x0
160 ; GFX12-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:3
161 ; GFX12-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2
162 ; GFX12-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1
163 ; GFX12-NEXT: v_interp_p10_f32 v0, v5, v0, v5 wait_exp:0
164 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
165 ; GFX12-NEXT: v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7
166 ; GFX12-NEXT: v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7
167 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
168 ; GFX12-NEXT: v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7
169 ; GFX12-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7
170 ; GFX12-NEXT: export mrt0 v6, v7, v8, v0 done
171 ; GFX12-NEXT: s_endpgm
; IR under test: %i and %j come from adjacent float slots (indices 1 and 2).
173 %i.ptr = getelementptr float, ptr addrspace(1) %ptr, i32 1
174 %i = load float, ptr addrspace(1) %i.ptr, align 4
175 %j.ptr = getelementptr float, ptr addrspace(1) %ptr, i32 2
176 %j = load float, ptr addrspace(1) %j.ptr, align 4
177 %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
178 %p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0)
179 %p2 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 2, i32 %m0)
180 %p3 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 3, i32 %m0)
181 %p0_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p0, float %i, float %p0)
182 %p0_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p0, float %j, float %p0_0)
183 %p1_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p1, float %i, float %p1)
184 %p1_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p1, float %j, float %p1_0)
185 %p2_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p2, float %i, float %p2)
186 %p2_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p2, float %j, float %p2_0)
187 %p3_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p3, float %i, float %p3)
188 %p3_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p3, float %j, float %p3_0)
189 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %p0_1, float %p1_1, float %p2_1, float %p3_1, i1 true, i1 true) #0
; v_interp_f16: f16 interpolation from a single loaded parameter.
; Calls interp.inreg.p10.f16 / interp.inreg.p2.f16 twice on the same %p0, once
; with the trailing i1 operand set to 0 and once set to 1, then adds the two
; half results and (per the signature) returns a half.
; This is the first function whose expected output differs between the two
; GFX11 runs: the TRUE16 run expects 16-bit register halves (v1.l / v1.h,
; v0.l / v0.h), while the FAKE16 and GFX12 runs expect full registers with
; op_sel on the instructions that come from the i1 1 calls.
193 define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
194 ; GFX11-TRUE16-LABEL: v_interp_f16:
195 ; GFX11-TRUE16: ; %bb.0: ; %main_body
196 ; GFX11-TRUE16-NEXT: s_mov_b32 s3, exec_lo
197 ; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo
198 ; GFX11-TRUE16-NEXT: s_mov_b32 m0, s2
199 ; GFX11-TRUE16-NEXT: lds_param_load v1, attr0.x wait_vdst:15
200 ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s3
201 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0
202 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s1
203 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
204 ; GFX11-TRUE16-NEXT: v_interp_p10_f16_f32 v3, v1.l, v0, v1.l wait_exp:0
205 ; GFX11-TRUE16-NEXT: v_interp_p10_f16_f32 v4, v1.h, v0, v1.h wait_exp:7
206 ; GFX11-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v1.l, v2, v3 wait_exp:7
207 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
208 ; GFX11-TRUE16-NEXT: v_interp_p2_f16_f32 v0.h, v1.h, v2, v4 wait_exp:7
209 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
210 ; GFX11-TRUE16-NEXT: ; return to shader part epilog
212 ; GFX11-FAKE16-LABEL: v_interp_f16:
213 ; GFX11-FAKE16: ; %bb.0: ; %main_body
214 ; GFX11-FAKE16-NEXT: s_mov_b32 s3, exec_lo
215 ; GFX11-FAKE16-NEXT: s_wqm_b32 exec_lo, exec_lo
216 ; GFX11-FAKE16-NEXT: s_mov_b32 m0, s2
217 ; GFX11-FAKE16-NEXT: lds_param_load v1, attr0.x wait_vdst:15
218 ; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s3
219 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0
220 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s1
221 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
222 ; GFX11-FAKE16-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1 wait_exp:0
223 ; GFX11-FAKE16-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
224 ; GFX11-FAKE16-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7
225 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
226 ; GFX11-FAKE16-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
227 ; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v3, v0
228 ; GFX11-FAKE16-NEXT: ; return to shader part epilog
230 ; GFX12-LABEL: v_interp_f16:
231 ; GFX12: ; %bb.0: ; %main_body
232 ; GFX12-NEXT: s_mov_b32 s3, exec_lo
233 ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo
234 ; GFX12-NEXT: s_mov_b32 m0, s2
235 ; GFX12-NEXT: ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:1
236 ; GFX12-NEXT: s_mov_b32 exec_lo, s3
237 ; GFX12-NEXT: v_mov_b32_e32 v0, s0
238 ; GFX12-NEXT: v_mov_b32_e32 v2, s1
239 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
240 ; GFX12-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1 wait_exp:0
241 ; GFX12-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
242 ; GFX12-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7
243 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
244 ; GFX12-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
245 ; GFX12-NEXT: v_add_f16_e32 v0, v3, v0
246 ; GFX12-NEXT: ; return to shader part epilog
; IR under test: %l_* use i1 0 and %h_* use i1 1 as the final intrinsic operand.
; NOTE(review): the i1 presumably selects the low/high 16-bit half of the
; parameter, matching the .l/.h operands in the TRUE16 checks; confirm against
; the intrinsic definition.
248 %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
249 %l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 0)
250 %l_p1 = call half @llvm.amdgcn.interp.inreg.p2.f16(float %p0, float %j, float %l_p0, i1 0)
251 %h_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 1)
252 %h_p1 = call half @llvm.amdgcn.interp.inreg.p2.f16(float %p0, float %j, float %h_p0, i1 1)
253 %res = fadd half %l_p1, %h_p1
; v_interp_rtz_f16: same shape as v_interp_f16, but using the
; llvm.amdgcn.interp.p10.rtz.f16 / llvm.amdgcn.interp.p2.rtz.f16 intrinsics
; (these names carry no ".inreg" component, unlike the other interp
; intrinsics in this file).
; The checks expect the v_interp_p10_rtz_f16_f32 / v_interp_p2_rtz_f16_f32
; mnemonics; register halves vs op_sel differ between the TRUE16 run and the
; FAKE16 / GFX12 runs exactly as in v_interp_f16.
257 define amdgpu_ps half @v_interp_rtz_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
258 ; GFX11-TRUE16-LABEL: v_interp_rtz_f16:
259 ; GFX11-TRUE16: ; %bb.0: ; %main_body
260 ; GFX11-TRUE16-NEXT: s_mov_b32 s3, exec_lo
261 ; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo
262 ; GFX11-TRUE16-NEXT: s_mov_b32 m0, s2
263 ; GFX11-TRUE16-NEXT: lds_param_load v1, attr0.x wait_vdst:15
264 ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s3
265 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0
266 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s1
267 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
268 ; GFX11-TRUE16-NEXT: v_interp_p10_rtz_f16_f32 v3, v1.l, v0, v1.l wait_exp:0
269 ; GFX11-TRUE16-NEXT: v_interp_p10_rtz_f16_f32 v4, v1.h, v0, v1.h wait_exp:7
270 ; GFX11-TRUE16-NEXT: v_interp_p2_rtz_f16_f32 v0.l, v1.l, v2, v3 wait_exp:7
271 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
272 ; GFX11-TRUE16-NEXT: v_interp_p2_rtz_f16_f32 v0.h, v1.h, v2, v4 wait_exp:7
273 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
274 ; GFX11-TRUE16-NEXT: ; return to shader part epilog
276 ; GFX11-FAKE16-LABEL: v_interp_rtz_f16:
277 ; GFX11-FAKE16: ; %bb.0: ; %main_body
278 ; GFX11-FAKE16-NEXT: s_mov_b32 s3, exec_lo
279 ; GFX11-FAKE16-NEXT: s_wqm_b32 exec_lo, exec_lo
280 ; GFX11-FAKE16-NEXT: s_mov_b32 m0, s2
281 ; GFX11-FAKE16-NEXT: lds_param_load v1, attr0.x wait_vdst:15
282 ; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s3
283 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0
284 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s1
285 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
286 ; GFX11-FAKE16-NEXT: v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0
287 ; GFX11-FAKE16-NEXT: v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
288 ; GFX11-FAKE16-NEXT: v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7
289 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
290 ; GFX11-FAKE16-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
291 ; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v3, v0
292 ; GFX11-FAKE16-NEXT: ; return to shader part epilog
294 ; GFX12-LABEL: v_interp_rtz_f16:
295 ; GFX12: ; %bb.0: ; %main_body
296 ; GFX12-NEXT: s_mov_b32 s3, exec_lo
297 ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo
298 ; GFX12-NEXT: s_mov_b32 m0, s2
299 ; GFX12-NEXT: ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:1
300 ; GFX12-NEXT: s_mov_b32 exec_lo, s3
301 ; GFX12-NEXT: v_mov_b32_e32 v0, s0
302 ; GFX12-NEXT: v_mov_b32_e32 v2, s1
303 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
304 ; GFX12-NEXT: v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0
305 ; GFX12-NEXT: v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
306 ; GFX12-NEXT: v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7
307 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
308 ; GFX12-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
309 ; GFX12-NEXT: v_add_f16_e32 v0, v3, v0
310 ; GFX12-NEXT: ; return to shader part epilog
; IR under test: operand layout is identical to v_interp_f16; only the
; intrinsic names change.
312 %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
313 %l_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 0)
314 %l_p1 = call half @llvm.amdgcn.interp.p2.rtz.f16(float %p0, float %j, float %l_p0, i1 0)
315 %h_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 1)
316 %h_p1 = call half @llvm.amdgcn.interp.p2.rtz.f16(float %p0, float %j, float %h_p0, i1 1)
317 %res = fadd half %l_p1, %h_p1
; v_interp_f16_imm_params: f16 interpolation with constant parameter operands.
; There is no lds.param.load and no %m0 argument here; both intrinsic calls
; receive the literal 0.0 where the other tests pass a loaded parameter (and,
; for p2, also where they pass the p10 result). The float p10 result is
; truncated to half with fptrunc and added to the half p2 result.
; The checks expect the zero to be materialised into a VGPR with a move, no
; s_wqm / exec save-restore sequence, and wait_exp:7 on both interp
; instructions.
321 define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) #0 {
322 ; GFX11-TRUE16-LABEL: v_interp_f16_imm_params:
323 ; GFX11-TRUE16: ; %bb.0: ; %main_body
324 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
325 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0
326 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s1
327 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
328 ; GFX11-TRUE16-NEXT: v_interp_p10_f16_f32 v1, v0.l, v1, v0.l wait_exp:7
329 ; GFX11-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v0.l, v3, v2 wait_exp:7
330 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
331 ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1
332 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.h, v0.l
333 ; GFX11-TRUE16-NEXT: ; return to shader part epilog
335 ; GFX11-FAKE16-LABEL: v_interp_f16_imm_params:
336 ; GFX11-FAKE16: ; %bb.0: ; %main_body
337 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
338 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s1
339 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
340 ; GFX11-FAKE16-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 wait_exp:7
341 ; GFX11-FAKE16-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v0 wait_exp:7
342 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
343 ; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1
344 ; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v1, v0
345 ; GFX11-FAKE16-NEXT: ; return to shader part epilog
347 ; GFX12-LABEL: v_interp_f16_imm_params:
348 ; GFX12: ; %bb.0: ; %main_body
349 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
350 ; GFX12-NEXT: v_mov_b32_e32 v2, s1
351 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
352 ; GFX12-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 wait_exp:7
353 ; GFX12-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v0 wait_exp:7
354 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
355 ; GFX12-NEXT: v_cvt_f16_f32_e32 v1, v1
356 ; GFX12-NEXT: v_add_f16_e32 v0, v1, v0
357 ; GFX12-NEXT: ; return to shader part epilog
; IR under test: the p2 call does not consume %l_p0; the two intrinsic results
; only meet in the final fadd.
359 %l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float 0.0, float %i, float 0.0, i1 0)
360 %l_p1 = call half @llvm.amdgcn.interp.inreg.p2.f16(float 0.0, float %j, float 0.0, i1 0)
361 %h = fptrunc float %l_p0 to half
362 %res = fadd half %h, %l_p1
; Intrinsic declarations used by the test functions in this file.
; The parameter load is the only declaration tagged with attribute group #1;
; every other declaration uses #0.
366 declare float @llvm.amdgcn.lds.param.load(i32, i32, i32) #1
367 declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #0
368 declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #0
; f16 variants take one extra trailing i1 operand; the p2 forms return half.
369 declare float @llvm.amdgcn.interp.inreg.p10.f16(float, float, float, i1) #0
370 declare half @llvm.amdgcn.interp.inreg.p2.f16(float, float, float, i1) #0
371 declare float @llvm.amdgcn.interp.p10.rtz.f16(float, float, float, i1) #0
372 declare half @llvm.amdgcn.interp.p2.rtz.f16(float, float, float, i1) #0
373 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
; NOTE(review): no function visible in this file calls llvm.amdgcn.exp.f16;
; the declaration appears to be unused here.
374 declare void @llvm.amdgcn.exp.f16(i32, i32, float, float, float, float, i1, i1) #0
; Attribute groups referenced above and on the function definitions.
376 attributes #0 = { nounwind }
377 attributes #1 = { nounwind readnone }