1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX1030 %s
3 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1013 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX1013 %s
4 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s
5 ; RUN: not --crash llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s -o /dev/null 2>&1 | FileCheck -check-prefix=ERR %s
7 ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr)
8 ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr)
9 ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(ulong node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr)
10 ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(ulong node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr)
12 declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <3 x float>, <3 x float>, <3 x float>, <4 x i32>)
13 declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <3 x float>, <3 x half>, <3 x half>, <4 x i32>)
14 declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <3 x float>, <3 x float>, <3 x float>, <4 x i32>)
15 declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <3 x float>, <3 x half>, <3 x half>, <4 x i32>)
16 declare i32 @llvm.amdgcn.workitem.id.x()
; Baseline case: 32-bit node pointer, <3 x float> ray direction, and the
; texture descriptor passed `inreg`. The checks expect a single
; image_bvh_intersect_ray with an 11-register address (v[0:10]) and the
; descriptor in s[0:3]. The ERR line pairs with the gfx1012 run, which expects
; llc to fail with "intrinsic not supported on subtarget" for this function.
18 define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) {
19 ; GCN-LABEL: image_bvh_intersect_ray:
21 ; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[0:3]
22 ; GCN-NEXT: s_waitcnt vmcnt(0)
23 ; GCN-NEXT: ; return to shader part epilog
24 ; ERR: in function image_bvh_intersect_ray{{.*}}intrinsic not supported on subtarget
25 %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
26 %r = bitcast <4 x i32> %v to <4 x float>
; Same intrinsic call as @image_bvh_intersect_ray, but the three ray vectors
; are assembled from scalar float arguments with insertelement. The checks
; expect the same single instruction with address v[0:10], i.e. no extra moves
; for building the vectors.
30 define amdgpu_ps <4 x float> @image_bvh_intersect_ray_flat(i32 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {
31 ; GCN-LABEL: image_bvh_intersect_ray_flat:
33 ; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[0:3]
34 ; GCN-NEXT: s_waitcnt vmcnt(0)
35 ; GCN-NEXT: ; return to shader part epilog
36 %ray_origin0 = insertelement <3 x float> undef, float %ray_origin_x, i32 0
37 %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1
38 %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2
39 %ray_dir0 = insertelement <3 x float> undef, float %ray_dir_x, i32 0
40 %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1
41 %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2
42 %ray_inv_dir0 = insertelement <3 x float> undef, float %ray_inv_dir_x, i32 0
43 %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
44 %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
45 %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
46 %r = bitcast <4 x i32> %v to <4 x float>
; Half-precision variant: %ray_dir and %ray_inv_dir are <3 x half>.
; The checks expect the half components to be repacked with VALU ops before an
; image_bvh_intersect_ray carrying the `a16` modifier. The GFX10 checks use a
; contiguous 8-register address (v[0:7]); the GFX11 checks instead pass a
; bracketed list of separate register groups.
50 define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) {
51 ; GFX10-LABEL: image_bvh_intersect_ray_a16:
53 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v5
54 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff, v7
55 ; GFX10-NEXT: v_and_b32_e32 v8, 0xffff, v8
56 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
57 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10
58 ; GFX10-NEXT: v_alignbit_b32 v7, v8, v7, 16
59 ; GFX10-NEXT: v_and_or_b32 v5, v5, 0xffff, v9
60 ; GFX10-NEXT: v_and_or_b32 v6, v6, 0xffff, v10
61 ; GFX10-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16
62 ; GFX10-NEXT: s_waitcnt vmcnt(0)
63 ; GFX10-NEXT: ; return to shader part epilog
65 ; GFX11-LABEL: image_bvh_intersect_ray_a16:
67 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v7
68 ; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v8
69 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
70 ; GFX11-NEXT: v_lshl_or_b32 v8, v5, 16, v9
71 ; GFX11-NEXT: v_perm_b32 v9, v5, v7, 0x7060302
72 ; GFX11-NEXT: v_lshl_or_b32 v10, v6, 16, v10
73 ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v0, v1, v[2:4], v[8:10]], s[0:3] a16
74 ; GFX11-NEXT: s_waitcnt vmcnt(0)
75 ; GFX11-NEXT: ; return to shader part epilog
76 %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
77 %r = bitcast <4 x i32> %v to <4 x float>
; 64-bit node pointer variant (i64 %node_ptr) with float ray vectors and an
; `inreg` descriptor. The checks expect a single image_bvh64_intersect_ray
; with a 12-register address (v[0:11]) and the descriptor in s[0:3].
81 define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) {
82 ; GCN-LABEL: image_bvh64_intersect_ray:
84 ; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
85 ; GCN-NEXT: s_waitcnt vmcnt(0)
86 ; GCN-NEXT: ; return to shader part epilog
87 %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
88 %r = bitcast <4 x i32> %v to <4 x float>
; Same intrinsic call as @image_bvh64_intersect_ray, except that the node
; pointer arrives as <2 x i32> and is bitcast to i64, and the ray vectors are
; built from scalar floats with insertelement. The checks expect the identical
; single instruction with address v[0:11].
92 define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_flat(<2 x i32> %node_ptr_vec, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {
93 ; GCN-LABEL: image_bvh64_intersect_ray_flat:
95 ; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
96 ; GCN-NEXT: s_waitcnt vmcnt(0)
97 ; GCN-NEXT: ; return to shader part epilog
98 %node_ptr = bitcast <2 x i32> %node_ptr_vec to i64
99 %ray_origin0 = insertelement <3 x float> undef, float %ray_origin_x, i32 0
100 %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1
101 %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2
102 %ray_dir0 = insertelement <3 x float> undef, float %ray_dir_x, i32 0
103 %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1
104 %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2
105 %ray_inv_dir0 = insertelement <3 x float> undef, float %ray_inv_dir_x, i32 0
106 %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
107 %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
108 %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
109 %r = bitcast <4 x i32> %v to <4 x float>
; 64-bit node pointer combined with <3 x half> %ray_dir / %ray_inv_dir.
; The checks expect the half components to be repacked with VALU ops before an
; image_bvh64_intersect_ray carrying the `a16` modifier. The GFX10 checks use a
; contiguous 9-register address (v[0:8]); the GFX11 checks pass a bracketed
; list of separate register groups, with the node pointer in v[0:1].
113 define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) {
114 ; GFX10-LABEL: image_bvh64_intersect_ray_a16:
116 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v6
117 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff, v8
118 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff, v9
119 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10
120 ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11
121 ; GFX10-NEXT: v_alignbit_b32 v8, v9, v8, 16
122 ; GFX10-NEXT: v_and_or_b32 v6, v6, 0xffff, v10
123 ; GFX10-NEXT: v_and_or_b32 v7, v7, 0xffff, v11
124 ; GFX10-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16
125 ; GFX10-NEXT: s_waitcnt vmcnt(0)
126 ; GFX10-NEXT: ; return to shader part epilog
128 ; GFX11-LABEL: image_bvh64_intersect_ray_a16:
130 ; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v8
131 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v9
132 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
133 ; GFX11-NEXT: v_lshl_or_b32 v9, v6, 16, v10
134 ; GFX11-NEXT: v_perm_b32 v10, v6, v8, 0x7060302
135 ; GFX11-NEXT: v_lshl_or_b32 v11, v7, 16, v11
136 ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[0:1], v2, v[3:5], v[9:11]], s[0:3] a16
137 ; GFX11-NEXT: s_waitcnt vmcnt(0)
138 ; GFX11-NEXT: ; return to shader part epilog
139 %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
140 %r = bitcast <4 x i32> %v to <4 x float>
; Here %tdescr is not `inreg`; the checks show it arriving in v[11:14] while
; the instruction still takes its descriptor from SGPRs (s[4:7]).
; Every target's checks therefore expect a loop (.LBB6_1) that:
;   - copies the descriptor into s[4:7] with v_readfirstlane_b32,
;   - compares s[4:7] against v[11:14] and limits exec to the matching lanes
;     with s_and_saveexec_b32,
;   - issues image_bvh_intersect_ray for those lanes,
;   - removes them from exec (s_xor_b32) and repeats while exec is non-zero.
; exec_lo is saved in s1 before the loop and restored afterwards.
; The per-target checks differ in register allocation and a few
; target-specific instructions (s_waitcnt_depctr and result copies on gfx1013,
; v_dual_mov_b32 / s_delay_alu on GFX11).
144 define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) {
145 ; GFX1030-LABEL: image_bvh_intersect_ray_vgpr_descr:
147 ; GFX1030-NEXT: v_mov_b32_e32 v15, v0
148 ; GFX1030-NEXT: v_mov_b32_e32 v16, v1
149 ; GFX1030-NEXT: v_mov_b32_e32 v17, v2
150 ; GFX1030-NEXT: v_mov_b32_e32 v18, v3
151 ; GFX1030-NEXT: v_mov_b32_e32 v19, v4
152 ; GFX1030-NEXT: v_mov_b32_e32 v20, v5
153 ; GFX1030-NEXT: v_mov_b32_e32 v21, v6
154 ; GFX1030-NEXT: v_mov_b32_e32 v22, v7
155 ; GFX1030-NEXT: v_mov_b32_e32 v23, v8
156 ; GFX1030-NEXT: v_mov_b32_e32 v24, v9
157 ; GFX1030-NEXT: v_mov_b32_e32 v25, v10
158 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo
159 ; GFX1030-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
160 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v11
161 ; GFX1030-NEXT: v_readfirstlane_b32 s5, v12
162 ; GFX1030-NEXT: v_readfirstlane_b32 s6, v13
163 ; GFX1030-NEXT: v_readfirstlane_b32 s7, v14
164 ; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12]
165 ; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
166 ; GFX1030-NEXT: s_and_b32 s0, vcc_lo, s0
167 ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0
168 ; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[15:25], s[4:7]
169 ; GFX1030-NEXT: ; implicit-def: $vgpr11
170 ; GFX1030-NEXT: ; implicit-def: $vgpr15
171 ; GFX1030-NEXT: ; implicit-def: $vgpr16
172 ; GFX1030-NEXT: ; implicit-def: $vgpr17
173 ; GFX1030-NEXT: ; implicit-def: $vgpr18
174 ; GFX1030-NEXT: ; implicit-def: $vgpr19
175 ; GFX1030-NEXT: ; implicit-def: $vgpr20
176 ; GFX1030-NEXT: ; implicit-def: $vgpr21
177 ; GFX1030-NEXT: ; implicit-def: $vgpr22
178 ; GFX1030-NEXT: ; implicit-def: $vgpr23
179 ; GFX1030-NEXT: ; implicit-def: $vgpr24
180 ; GFX1030-NEXT: ; implicit-def: $vgpr25
181 ; GFX1030-NEXT: ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14
182 ; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0
183 ; GFX1030-NEXT: s_cbranch_execnz .LBB6_1
184 ; GFX1030-NEXT: ; %bb.2:
185 ; GFX1030-NEXT: s_mov_b32 exec_lo, s1
186 ; GFX1030-NEXT: s_waitcnt vmcnt(0)
187 ; GFX1030-NEXT: ; return to shader part epilog
189 ; GFX1013-LABEL: image_bvh_intersect_ray_vgpr_descr:
191 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo
192 ; GFX1013-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
193 ; GFX1013-NEXT: v_readfirstlane_b32 s4, v11
194 ; GFX1013-NEXT: v_readfirstlane_b32 s5, v12
195 ; GFX1013-NEXT: v_readfirstlane_b32 s6, v13
196 ; GFX1013-NEXT: v_readfirstlane_b32 s7, v14
197 ; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12]
198 ; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
199 ; GFX1013-NEXT: s_and_b32 s0, vcc_lo, s0
200 ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0
201 ; GFX1013-NEXT: image_bvh_intersect_ray v[15:18], v[0:10], s[4:7]
202 ; GFX1013-NEXT: ; implicit-def: $vgpr11
203 ; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10
204 ; GFX1013-NEXT: ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14
205 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3
206 ; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0
207 ; GFX1013-NEXT: s_cbranch_execnz .LBB6_1
208 ; GFX1013-NEXT: ; %bb.2:
209 ; GFX1013-NEXT: s_mov_b32 exec_lo, s1
210 ; GFX1013-NEXT: s_waitcnt vmcnt(0)
211 ; GFX1013-NEXT: v_mov_b32_e32 v0, v15
212 ; GFX1013-NEXT: v_mov_b32_e32 v1, v16
213 ; GFX1013-NEXT: v_mov_b32_e32 v2, v17
214 ; GFX1013-NEXT: v_mov_b32_e32 v3, v18
215 ; GFX1013-NEXT: ; return to shader part epilog
217 ; GFX11-LABEL: image_bvh_intersect_ray_vgpr_descr:
219 ; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v19, v1
220 ; GFX11-NEXT: v_dual_mov_b32 v15, v2 :: v_dual_mov_b32 v16, v3
221 ; GFX11-NEXT: v_mov_b32_e32 v17, v4
222 ; GFX11-NEXT: s_mov_b32 s1, exec_lo
223 ; GFX11-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
224 ; GFX11-NEXT: v_readfirstlane_b32 s4, v11
225 ; GFX11-NEXT: v_readfirstlane_b32 s5, v12
226 ; GFX11-NEXT: v_readfirstlane_b32 s6, v13
227 ; GFX11-NEXT: v_readfirstlane_b32 s7, v14
228 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
229 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12]
230 ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
231 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
232 ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
233 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0
234 ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v18, v19, v[15:17], v[5:7], v[8:10]], s[4:7]
235 ; GFX11-NEXT: ; implicit-def: $vgpr11
236 ; GFX11-NEXT: ; implicit-def: $vgpr18
237 ; GFX11-NEXT: ; implicit-def: $vgpr19
238 ; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17
239 ; GFX11-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7
240 ; GFX11-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10
241 ; GFX11-NEXT: ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14
242 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
243 ; GFX11-NEXT: s_cbranch_execnz .LBB6_1
244 ; GFX11-NEXT: ; %bb.2:
245 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
246 ; GFX11-NEXT: s_waitcnt vmcnt(0)
247 ; GFX11-NEXT: ; return to shader part epilog
248 %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
249 %r = bitcast <4 x i32> %v to <4 x float>
; Combines the two preceding variations: <3 x half> %ray_dir / %ray_inv_dir
; and a %tdescr that is not `inreg` (the checks show it in v[9:12]).
; The checks expect the half components to be repacked before the loop, then a
; loop (.LBB7_1) that copies the descriptor into s[4:7] with
; v_readfirstlane_b32, limits exec to lanes whose descriptor matches, issues
; the `a16` image_bvh_intersect_ray, and repeats until exec is empty.
; exec_lo is saved in s1 before the loop and restored afterwards.
253 define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) {
254 ; GFX1030-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
256 ; GFX1030-NEXT: v_mov_b32_e32 v13, v0
257 ; GFX1030-NEXT: v_mov_b32_e32 v14, v1
258 ; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v5
259 ; GFX1030-NEXT: v_and_b32_e32 v1, 0xffff, v7
260 ; GFX1030-NEXT: v_mov_b32_e32 v15, v2
261 ; GFX1030-NEXT: v_and_b32_e32 v2, 0xffff, v8
262 ; GFX1030-NEXT: v_mov_b32_e32 v16, v3
263 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0
264 ; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1
265 ; GFX1030-NEXT: v_mov_b32_e32 v17, v4
266 ; GFX1030-NEXT: v_alignbit_b32 v20, v2, v7, 16
267 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo
268 ; GFX1030-NEXT: v_and_or_b32 v18, v5, 0xffff, v0
269 ; GFX1030-NEXT: v_and_or_b32 v19, v6, 0xffff, v1
270 ; GFX1030-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
271 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v9
272 ; GFX1030-NEXT: v_readfirstlane_b32 s5, v10
273 ; GFX1030-NEXT: v_readfirstlane_b32 s6, v11
274 ; GFX1030-NEXT: v_readfirstlane_b32 s7, v12
275 ; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
276 ; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[11:12]
277 ; GFX1030-NEXT: s_and_b32 s0, vcc_lo, s0
278 ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0
279 ; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[13:20], s[4:7] a16
280 ; GFX1030-NEXT: ; implicit-def: $vgpr9
281 ; GFX1030-NEXT: ; implicit-def: $vgpr13
282 ; GFX1030-NEXT: ; implicit-def: $vgpr14
283 ; GFX1030-NEXT: ; implicit-def: $vgpr15
284 ; GFX1030-NEXT: ; implicit-def: $vgpr16
285 ; GFX1030-NEXT: ; implicit-def: $vgpr17
286 ; GFX1030-NEXT: ; implicit-def: $vgpr18
287 ; GFX1030-NEXT: ; implicit-def: $vgpr19
288 ; GFX1030-NEXT: ; implicit-def: $vgpr20
289 ; GFX1030-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12
290 ; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0
291 ; GFX1030-NEXT: s_cbranch_execnz .LBB7_1
292 ; GFX1030-NEXT: ; %bb.2:
293 ; GFX1030-NEXT: s_mov_b32 exec_lo, s1
294 ; GFX1030-NEXT: s_waitcnt vmcnt(0)
295 ; GFX1030-NEXT: ; return to shader part epilog
297 ; GFX1013-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
299 ; GFX1013-NEXT: v_lshrrev_b32_e32 v13, 16, v5
300 ; GFX1013-NEXT: v_and_b32_e32 v14, 0xffff, v7
301 ; GFX1013-NEXT: v_and_b32_e32 v8, 0xffff, v8
302 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo
303 ; GFX1013-NEXT: v_lshlrev_b32_e32 v13, 16, v13
304 ; GFX1013-NEXT: v_lshlrev_b32_e32 v14, 16, v14
305 ; GFX1013-NEXT: v_alignbit_b32 v7, v8, v7, 16
306 ; GFX1013-NEXT: v_and_or_b32 v5, v5, 0xffff, v13
307 ; GFX1013-NEXT: v_and_or_b32 v6, v6, 0xffff, v14
308 ; GFX1013-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
309 ; GFX1013-NEXT: v_readfirstlane_b32 s4, v9
310 ; GFX1013-NEXT: v_readfirstlane_b32 s5, v10
311 ; GFX1013-NEXT: v_readfirstlane_b32 s6, v11
312 ; GFX1013-NEXT: v_readfirstlane_b32 s7, v12
313 ; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
314 ; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[11:12]
315 ; GFX1013-NEXT: s_and_b32 s0, vcc_lo, s0
316 ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0
317 ; GFX1013-NEXT: image_bvh_intersect_ray v[13:16], v[0:7], s[4:7] a16
318 ; GFX1013-NEXT: ; implicit-def: $vgpr9
319 ; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
320 ; GFX1013-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12
321 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3
322 ; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0
323 ; GFX1013-NEXT: s_cbranch_execnz .LBB7_1
324 ; GFX1013-NEXT: ; %bb.2:
325 ; GFX1013-NEXT: s_mov_b32 exec_lo, s1
326 ; GFX1013-NEXT: s_waitcnt vmcnt(0)
327 ; GFX1013-NEXT: v_mov_b32_e32 v0, v13
328 ; GFX1013-NEXT: v_mov_b32_e32 v1, v14
329 ; GFX1013-NEXT: v_mov_b32_e32 v2, v15
330 ; GFX1013-NEXT: v_mov_b32_e32 v3, v16
331 ; GFX1013-NEXT: ; return to shader part epilog
333 ; GFX11-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
335 ; GFX11-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
336 ; GFX11-NEXT: v_dual_mov_b32 v15, v4 :: v_dual_and_b32 v0, 0xffff, v7
337 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v8
338 ; GFX11-NEXT: v_dual_mov_b32 v13, v2 :: v_dual_mov_b32 v14, v3
339 ; GFX11-NEXT: s_mov_b32 s1, exec_lo
340 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
341 ; GFX11-NEXT: v_lshl_or_b32 v4, v5, 16, v0
342 ; GFX11-NEXT: v_perm_b32 v5, v5, v7, 0x7060302
343 ; GFX11-NEXT: v_lshl_or_b32 v6, v6, 16, v1
344 ; GFX11-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
345 ; GFX11-NEXT: v_readfirstlane_b32 s4, v9
346 ; GFX11-NEXT: v_readfirstlane_b32 s5, v10
347 ; GFX11-NEXT: v_readfirstlane_b32 s6, v11
348 ; GFX11-NEXT: v_readfirstlane_b32 s7, v12
349 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
350 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
351 ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[11:12]
352 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
353 ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
354 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0
355 ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v16, v17, v[13:15], v[4:6]], s[4:7] a16
356 ; GFX11-NEXT: ; implicit-def: $vgpr9
357 ; GFX11-NEXT: ; implicit-def: $vgpr16
358 ; GFX11-NEXT: ; implicit-def: $vgpr17
359 ; GFX11-NEXT: ; implicit-def: $vgpr13_vgpr14_vgpr15
360 ; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6
361 ; GFX11-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12
362 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
363 ; GFX11-NEXT: s_cbranch_execnz .LBB7_1
364 ; GFX11-NEXT: ; %bb.2:
365 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
366 ; GFX11-NEXT: s_waitcnt vmcnt(0)
367 ; GFX11-NEXT: ; return to shader part epilog
368 %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
369 %r = bitcast <4 x i32> %v to <4 x float>
; 64-bit node pointer with a %tdescr that is not `inreg`; the checks show the
; descriptor arriving in v[12:15].
; The checks expect a loop (.LBB8_1) that copies the descriptor into s[4:7]
; with v_readfirstlane_b32, limits exec to lanes whose descriptor matches
; (s_and_saveexec_b32), issues image_bvh64_intersect_ray, then removes those
; lanes from exec and repeats while exec is non-zero. exec_lo is saved in s1
; before the loop and restored afterwards.
373 define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) {
374 ; GFX1030-LABEL: image_bvh64_intersect_ray_vgpr_descr:
376 ; GFX1030-NEXT: v_mov_b32_e32 v16, v0
377 ; GFX1030-NEXT: v_mov_b32_e32 v17, v1
378 ; GFX1030-NEXT: v_mov_b32_e32 v18, v2
379 ; GFX1030-NEXT: v_mov_b32_e32 v19, v3
380 ; GFX1030-NEXT: v_mov_b32_e32 v20, v4
381 ; GFX1030-NEXT: v_mov_b32_e32 v21, v5
382 ; GFX1030-NEXT: v_mov_b32_e32 v22, v6
383 ; GFX1030-NEXT: v_mov_b32_e32 v23, v7
384 ; GFX1030-NEXT: v_mov_b32_e32 v24, v8
385 ; GFX1030-NEXT: v_mov_b32_e32 v25, v9
386 ; GFX1030-NEXT: v_mov_b32_e32 v26, v10
387 ; GFX1030-NEXT: v_mov_b32_e32 v27, v11
388 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo
389 ; GFX1030-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
390 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v12
391 ; GFX1030-NEXT: v_readfirstlane_b32 s5, v13
392 ; GFX1030-NEXT: v_readfirstlane_b32 s6, v14
393 ; GFX1030-NEXT: v_readfirstlane_b32 s7, v15
394 ; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[12:13]
395 ; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[14:15]
396 ; GFX1030-NEXT: s_and_b32 s0, vcc_lo, s0
397 ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0
398 ; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[16:27], s[4:7]
399 ; GFX1030-NEXT: ; implicit-def: $vgpr12
400 ; GFX1030-NEXT: ; implicit-def: $vgpr16
401 ; GFX1030-NEXT: ; implicit-def: $vgpr17
402 ; GFX1030-NEXT: ; implicit-def: $vgpr18
403 ; GFX1030-NEXT: ; implicit-def: $vgpr19
404 ; GFX1030-NEXT: ; implicit-def: $vgpr20
405 ; GFX1030-NEXT: ; implicit-def: $vgpr21
406 ; GFX1030-NEXT: ; implicit-def: $vgpr22
407 ; GFX1030-NEXT: ; implicit-def: $vgpr23
408 ; GFX1030-NEXT: ; implicit-def: $vgpr24
409 ; GFX1030-NEXT: ; implicit-def: $vgpr25
410 ; GFX1030-NEXT: ; implicit-def: $vgpr26
411 ; GFX1030-NEXT: ; implicit-def: $vgpr27
412 ; GFX1030-NEXT: ; implicit-def: $vgpr12_vgpr13_vgpr14_vgpr15
413 ; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0
414 ; GFX1030-NEXT: s_cbranch_execnz .LBB8_1
415 ; GFX1030-NEXT: ; %bb.2:
416 ; GFX1030-NEXT: s_mov_b32 exec_lo, s1
417 ; GFX1030-NEXT: s_waitcnt vmcnt(0)
418 ; GFX1030-NEXT: ; return to shader part epilog
420 ; GFX1013-LABEL: image_bvh64_intersect_ray_vgpr_descr:
422 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo
423 ; GFX1013-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
424 ; GFX1013-NEXT: v_readfirstlane_b32 s4, v12
425 ; GFX1013-NEXT: v_readfirstlane_b32 s5, v13
426 ; GFX1013-NEXT: v_readfirstlane_b32 s6, v14
427 ; GFX1013-NEXT: v_readfirstlane_b32 s7, v15
428 ; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[12:13]
429 ; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[14:15]
430 ; GFX1013-NEXT: s_and_b32 s0, vcc_lo, s0
431 ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0
432 ; GFX1013-NEXT: image_bvh64_intersect_ray v[16:19], v[0:11], s[4:7]
433 ; GFX1013-NEXT: ; implicit-def: $vgpr12
434 ; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
435 ; GFX1013-NEXT: ; implicit-def: $vgpr12_vgpr13_vgpr14_vgpr15
436 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3
437 ; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0
438 ; GFX1013-NEXT: s_cbranch_execnz .LBB8_1
439 ; GFX1013-NEXT: ; %bb.2:
440 ; GFX1013-NEXT: s_mov_b32 exec_lo, s1
441 ; GFX1013-NEXT: s_waitcnt vmcnt(0)
442 ; GFX1013-NEXT: v_mov_b32_e32 v0, v16
443 ; GFX1013-NEXT: v_mov_b32_e32 v1, v17
444 ; GFX1013-NEXT: v_mov_b32_e32 v2, v18
445 ; GFX1013-NEXT: v_mov_b32_e32 v3, v19
446 ; GFX1013-NEXT: ; return to shader part epilog
448 ; GFX11-LABEL: image_bvh64_intersect_ray_vgpr_descr:
450 ; GFX11-NEXT: v_dual_mov_b32 v19, v0 :: v_dual_mov_b32 v20, v1
451 ; GFX11-NEXT: v_dual_mov_b32 v21, v2 :: v_dual_mov_b32 v16, v3
452 ; GFX11-NEXT: v_dual_mov_b32 v17, v4 :: v_dual_mov_b32 v18, v5
453 ; GFX11-NEXT: s_mov_b32 s1, exec_lo
454 ; GFX11-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
455 ; GFX11-NEXT: v_readfirstlane_b32 s4, v12
456 ; GFX11-NEXT: v_readfirstlane_b32 s5, v13
457 ; GFX11-NEXT: v_readfirstlane_b32 s6, v14
458 ; GFX11-NEXT: v_readfirstlane_b32 s7, v15
459 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
460 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[12:13]
461 ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[14:15]
462 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
463 ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
464 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0
465 ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[19:20], v21, v[16:18], v[6:8], v[9:11]], s[4:7]
466 ; GFX11-NEXT: ; implicit-def: $vgpr12
467 ; GFX11-NEXT: ; implicit-def: $vgpr19_vgpr20
468 ; GFX11-NEXT: ; implicit-def: $vgpr21
469 ; GFX11-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18
470 ; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8
471 ; GFX11-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11
472 ; GFX11-NEXT: ; implicit-def: $vgpr12_vgpr13_vgpr14_vgpr15
473 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
474 ; GFX11-NEXT: s_cbranch_execnz .LBB8_1
475 ; GFX11-NEXT: ; %bb.2:
476 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
477 ; GFX11-NEXT: s_waitcnt vmcnt(0)
478 ; GFX11-NEXT: ; return to shader part epilog
479 %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
480 %r = bitcast <4 x i32> %v to <4 x float>
; 64-bit node pointer, <3 x half> %ray_dir / %ray_inv_dir, and a %tdescr that
; is not `inreg` (the checks show it in v[10:13]).
; The checks expect the half components to be repacked before the loop, then a
; loop (.LBB9_1) that copies the descriptor into s[4:7] with
; v_readfirstlane_b32, limits exec to lanes whose descriptor matches, issues
; the `a16` image_bvh64_intersect_ray, and repeats until exec is empty.
; exec_lo is saved in s1 before the loop and restored afterwards.
484 define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) {
485 ; GFX1030-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
487 ; GFX1030-NEXT: v_mov_b32_e32 v14, v0
488 ; GFX1030-NEXT: v_mov_b32_e32 v15, v1
489 ; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v6
490 ; GFX1030-NEXT: v_and_b32_e32 v1, 0xffff, v8
491 ; GFX1030-NEXT: v_mov_b32_e32 v16, v2
492 ; GFX1030-NEXT: v_and_b32_e32 v2, 0xffff, v9
493 ; GFX1030-NEXT: v_mov_b32_e32 v17, v3
494 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0
495 ; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1
496 ; GFX1030-NEXT: v_mov_b32_e32 v18, v4
497 ; GFX1030-NEXT: v_mov_b32_e32 v19, v5
498 ; GFX1030-NEXT: v_alignbit_b32 v22, v2, v8, 16
499 ; GFX1030-NEXT: v_and_or_b32 v20, v6, 0xffff, v0
500 ; GFX1030-NEXT: v_and_or_b32 v21, v7, 0xffff, v1
501 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo
502 ; GFX1030-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
503 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v10
504 ; GFX1030-NEXT: v_readfirstlane_b32 s5, v11
505 ; GFX1030-NEXT: v_readfirstlane_b32 s6, v12
506 ; GFX1030-NEXT: v_readfirstlane_b32 s7, v13
507 ; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11]
508 ; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
509 ; GFX1030-NEXT: s_and_b32 s0, vcc_lo, s0
510 ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0
511 ; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[14:22], s[4:7] a16
512 ; GFX1030-NEXT: ; implicit-def: $vgpr10
513 ; GFX1030-NEXT: ; implicit-def: $vgpr14
514 ; GFX1030-NEXT: ; implicit-def: $vgpr15
515 ; GFX1030-NEXT: ; implicit-def: $vgpr16
516 ; GFX1030-NEXT: ; implicit-def: $vgpr17
517 ; GFX1030-NEXT: ; implicit-def: $vgpr18
518 ; GFX1030-NEXT: ; implicit-def: $vgpr19
519 ; GFX1030-NEXT: ; implicit-def: $vgpr20
520 ; GFX1030-NEXT: ; implicit-def: $vgpr21
521 ; GFX1030-NEXT: ; implicit-def: $vgpr22
522 ; GFX1030-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13
523 ; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0
524 ; GFX1030-NEXT: s_cbranch_execnz .LBB9_1
525 ; GFX1030-NEXT: ; %bb.2:
526 ; GFX1030-NEXT: s_mov_b32 exec_lo, s1
527 ; GFX1030-NEXT: s_waitcnt vmcnt(0)
528 ; GFX1030-NEXT: ; return to shader part epilog
530 ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
532 ; GFX1013-NEXT: v_lshrrev_b32_e32 v14, 16, v6
533 ; GFX1013-NEXT: v_and_b32_e32 v15, 0xffff, v8
534 ; GFX1013-NEXT: v_and_b32_e32 v9, 0xffff, v9
535 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo
536 ; GFX1013-NEXT: v_lshlrev_b32_e32 v14, 16, v14
537 ; GFX1013-NEXT: v_lshlrev_b32_e32 v15, 16, v15
538 ; GFX1013-NEXT: v_alignbit_b32 v8, v9, v8, 16
539 ; GFX1013-NEXT: v_and_or_b32 v6, v6, 0xffff, v14
540 ; GFX1013-NEXT: v_and_or_b32 v7, v7, 0xffff, v15
541 ; GFX1013-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
542 ; GFX1013-NEXT: v_readfirstlane_b32 s4, v10
543 ; GFX1013-NEXT: v_readfirstlane_b32 s5, v11
544 ; GFX1013-NEXT: v_readfirstlane_b32 s6, v12
545 ; GFX1013-NEXT: v_readfirstlane_b32 s7, v13
546 ; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11]
547 ; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
548 ; GFX1013-NEXT: s_and_b32 s0, vcc_lo, s0
549 ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0
550 ; GFX1013-NEXT: image_bvh64_intersect_ray v[14:17], v[0:8], s[4:7] a16
551 ; GFX1013-NEXT: ; implicit-def: $vgpr10
552 ; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8
553 ; GFX1013-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13
554 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3
555 ; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0
556 ; GFX1013-NEXT: s_cbranch_execnz .LBB9_1
557 ; GFX1013-NEXT: ; %bb.2:
558 ; GFX1013-NEXT: s_mov_b32 exec_lo, s1
559 ; GFX1013-NEXT: s_waitcnt vmcnt(0)
560 ; GFX1013-NEXT: v_mov_b32_e32 v0, v14
561 ; GFX1013-NEXT: v_mov_b32_e32 v1, v15
562 ; GFX1013-NEXT: v_mov_b32_e32 v2, v16
563 ; GFX1013-NEXT: v_mov_b32_e32 v3, v17
564 ; GFX1013-NEXT: ; return to shader part epilog
566 ; GFX11-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
568 ; GFX11-NEXT: v_dual_mov_b32 v17, v0 :: v_dual_mov_b32 v18, v1
569 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v8
570 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v9
571 ; GFX11-NEXT: v_dual_mov_b32 v19, v2 :: v_dual_mov_b32 v14, v3
572 ; GFX11-NEXT: v_dual_mov_b32 v15, v4 :: v_dual_mov_b32 v16, v5
573 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
574 ; GFX11-NEXT: v_lshl_or_b32 v4, v6, 16, v0
575 ; GFX11-NEXT: v_perm_b32 v5, v6, v8, 0x7060302
576 ; GFX11-NEXT: v_lshl_or_b32 v6, v7, 16, v1
577 ; GFX11-NEXT: s_mov_b32 s1, exec_lo
578 ; GFX11-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
579 ; GFX11-NEXT: v_readfirstlane_b32 s4, v10
580 ; GFX11-NEXT: v_readfirstlane_b32 s5, v11
581 ; GFX11-NEXT: v_readfirstlane_b32 s6, v12
582 ; GFX11-NEXT: v_readfirstlane_b32 s7, v13
583 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
584 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11]
585 ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
586 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
587 ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
588 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0
589 ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[17:18], v19, v[14:16], v[4:6]], s[4:7] a16
590 ; GFX11-NEXT: ; implicit-def: $vgpr10
591 ; GFX11-NEXT: ; implicit-def: $vgpr17_vgpr18
592 ; GFX11-NEXT: ; implicit-def: $vgpr19
593 ; GFX11-NEXT: ; implicit-def: $vgpr14_vgpr15_vgpr16
594 ; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6
595 ; GFX11-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13
596 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
597 ; GFX11-NEXT: s_cbranch_execnz .LBB9_1
598 ; GFX11-NEXT: ; %bb.2:
599 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
600 ; GFX11-NEXT: s_waitcnt vmcnt(0)
601 ; GFX11-NEXT: ; return to shader part epilog
602 %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
603 %r = bitcast <4 x i32> %v to <4 x float>
; Kernel test for llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32 (32-bit node
; pointer, f32 ray direction and inverse direction).
;   - node_ptr and ray_extent are loaded per work-item from %p_node_ptr and
;     %p_ray, both indexed by workitem.id.x.
;   - ray_origin, ray_dir and ray_inv_dir are the constant vectors <0,1,2>,
;     <3,4,5> and <6,7,8>, so the checks below materialise them as immediates
;     (0x40400000 = 3.0, 0x40a00000 = 5.0, 0x40c00000 = 6.0,
;     0x40e00000 = 7.0, 0x41000000 = 8.0).
; The GFX1030 and GFX1013 checks expect all eleven address dwords in the single
; contiguous range v[0:10]; the GFX11 checks expect the bracketed
; multi-register address list instead.
; NOTE(review): "nsa_reassign" presumably refers to register reassignment for
; the non-sequential-address (NSA) encoding -- confirm against the pass that
; this test is meant to cover.
607 define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) {
608 ; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign:
610 ; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
611 ; GFX1030-NEXT: v_lshlrev_b32_e32 v4, 2, v0
612 ; GFX1030-NEXT: v_mov_b32_e32 v5, 0x40400000
613 ; GFX1030-NEXT: v_mov_b32_e32 v6, 4.0
614 ; GFX1030-NEXT: v_mov_b32_e32 v7, 0x40a00000
615 ; GFX1030-NEXT: v_mov_b32_e32 v8, 0x40c00000
616 ; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40e00000
617 ; GFX1030-NEXT: v_mov_b32_e32 v10, 0x41000000
618 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
619 ; GFX1030-NEXT: v_mov_b32_e32 v0, s0
620 ; GFX1030-NEXT: v_mov_b32_e32 v1, s1
621 ; GFX1030-NEXT: v_mov_b32_e32 v2, s2
622 ; GFX1030-NEXT: v_mov_b32_e32 v3, s3
623 ; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
624 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
625 ; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
626 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
627 ; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0
628 ; GFX1030-NEXT: flat_load_dword v0, v[0:1]
629 ; GFX1030-NEXT: flat_load_dword v1, v[2:3]
630 ; GFX1030-NEXT: v_mov_b32_e32 v2, 0
631 ; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0
632 ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
633 ; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[4:7]
634 ; GFX1030-NEXT: s_waitcnt vmcnt(0)
635 ; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
636 ; GFX1030-NEXT: s_endpgm
638 ; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign:
640 ; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
641 ; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0
642 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000
643 ; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40c00000
644 ; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40e00000
645 ; GFX1013-NEXT: v_mov_b32_e32 v10, 0x41000000
646 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
647 ; GFX1013-NEXT: v_mov_b32_e32 v0, s0
648 ; GFX1013-NEXT: v_mov_b32_e32 v1, s1
649 ; GFX1013-NEXT: v_mov_b32_e32 v2, s2
650 ; GFX1013-NEXT: v_mov_b32_e32 v3, s3
651 ; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6
652 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
653 ; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
654 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
655 ; GFX1013-NEXT: v_mov_b32_e32 v6, 4.0
656 ; GFX1013-NEXT: flat_load_dword v0, v[4:5]
657 ; GFX1013-NEXT: flat_load_dword v1, v[2:3]
658 ; GFX1013-NEXT: v_mov_b32_e32 v2, 0
659 ; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0
660 ; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0
661 ; GFX1013-NEXT: v_mov_b32_e32 v5, 0x40400000
662 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
663 ; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[4:7]
664 ; GFX1013-NEXT: s_waitcnt vmcnt(0)
665 ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
666 ; GFX1013-NEXT: s_endpgm
668 ; GFX11-LABEL: image_bvh_intersect_ray_nsa_reassign:
670 ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
671 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0
672 ; GFX11-NEXT: s_mov_b32 s8, 0x40400000
673 ; GFX11-NEXT: s_mov_b32 s12, 0x40c00000
674 ; GFX11-NEXT: s_mov_b32 s10, 0x40a00000
675 ; GFX11-NEXT: s_mov_b32 s9, 4.0
676 ; GFX11-NEXT: s_mov_b32 s14, 0x41000000
677 ; GFX11-NEXT: s_mov_b32 s13, 0x40e00000
678 ; GFX11-NEXT: v_mov_b32_e32 v6, s12
679 ; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v7, s13
680 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
681 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
682 ; GFX11-NEXT: s_mov_b32 s1, 1.0
683 ; GFX11-NEXT: s_mov_b32 s0, 0
684 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
685 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
686 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
687 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
688 ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
689 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
690 ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
691 ; GFX11-NEXT: flat_load_b32 v9, v[0:1]
692 ; GFX11-NEXT: flat_load_b32 v10, v[2:3]
693 ; GFX11-NEXT: s_mov_b32 s2, 2.0
694 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8
695 ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
696 ; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v4, s9
697 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
698 ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[0:2], v[3:5], v[6:8]], s[4:7]
699 ; GFX11-NEXT: s_waitcnt vmcnt(0)
700 ; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
701 ; GFX11-NEXT: s_endpgm
; Per-work-item inputs: element %lid of each pointer argument.
702 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
703 %gep_node_ptr = getelementptr inbounds i32, ptr %p_node_ptr, i32 %lid
704 %node_ptr = load i32, ptr %gep_node_ptr, align 4
705 %gep_ray = getelementptr inbounds float, ptr %p_ray, i32 %lid
706 %ray_extent = load float, ptr %gep_ray, align 4
; Constant ray vectors, built one lane at a time on top of undef.
707 %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
708 %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
709 %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
710 %ray_dir0 = insertelement <3 x float> undef, float 3.0, i32 0
711 %ray_dir1 = insertelement <3 x float> %ray_dir0, float 4.0, i32 1
712 %ray_dir = insertelement <3 x float> %ray_dir1, float 5.0, i32 2
713 %ray_inv_dir0 = insertelement <3 x float> undef, float 6.0, i32 0
714 %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float 7.0, i32 1
715 %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float 8.0, i32 2
716 %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
; The result is stored through an undef pointer; the checks above only pin the
; store instruction that is emitted for it.
717 store <4 x i32> %v, ptr undef
; Kernel test for llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16 (32-bit node
; pointer, half-precision ray direction and inverse direction; the expected
; instruction carries the a16 modifier).
;   - node_ptr and ray_extent are loaded per work-item, indexed by
;     workitem.id.x.
;   - ray_origin is <0,1,2> (f32); ray_dir is <3,4,5> and ray_inv_dir is
;     <6,7,8> (f16: 0x4200, 0x4400, 0x4500, 0x4600, 0x4700, 0x4800).
; The expected packing of the six halves into three dwords differs by target:
;   - GFX1030 / GFX1013 checks: 0x44004200, 0x46004500, 0x48004700, i.e. the
;     halves in order dir.x, dir.y, dir.z, inv.x, inv.y, inv.z, two per dword
;     with the earlier one in the low 16 bits.
;   - GFX11 checks: 0x42004600, 0x44004700, 0x45004800, i.e. per component,
;     dir in the high 16 bits and inv_dir in the low 16 bits.
; GFX1030 / GFX1013 expect the contiguous address range v[0:7]; GFX11 expects
; the bracketed multi-register address list.
; NOTE(review): "nsa_reassign" presumably refers to register reassignment for
; the non-sequential-address (NSA) encoding -- confirm against the pass that
; this test is meant to cover.
721 define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) {
722 ; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
724 ; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
725 ; GFX1030-NEXT: v_lshlrev_b32_e32 v4, 2, v0
726 ; GFX1030-NEXT: v_mov_b32_e32 v5, 0x44004200
727 ; GFX1030-NEXT: v_mov_b32_e32 v6, 0x46004500
728 ; GFX1030-NEXT: v_mov_b32_e32 v7, 0x48004700
729 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
730 ; GFX1030-NEXT: v_mov_b32_e32 v0, s0
731 ; GFX1030-NEXT: v_mov_b32_e32 v1, s1
732 ; GFX1030-NEXT: v_mov_b32_e32 v2, s2
733 ; GFX1030-NEXT: v_mov_b32_e32 v3, s3
734 ; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
735 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
736 ; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
737 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
738 ; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0
739 ; GFX1030-NEXT: flat_load_dword v0, v[0:1]
740 ; GFX1030-NEXT: flat_load_dword v1, v[2:3]
741 ; GFX1030-NEXT: v_mov_b32_e32 v2, 0
742 ; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0
743 ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
744 ; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16
745 ; GFX1030-NEXT: s_waitcnt vmcnt(0)
746 ; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
747 ; GFX1030-NEXT: s_endpgm
749 ; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
751 ; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
752 ; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0
753 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x48004700
754 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
755 ; GFX1013-NEXT: v_mov_b32_e32 v0, s0
756 ; GFX1013-NEXT: v_mov_b32_e32 v1, s1
757 ; GFX1013-NEXT: v_mov_b32_e32 v2, s2
758 ; GFX1013-NEXT: v_mov_b32_e32 v3, s3
759 ; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6
760 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
761 ; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
762 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
763 ; GFX1013-NEXT: v_mov_b32_e32 v6, 0x46004500
764 ; GFX1013-NEXT: flat_load_dword v0, v[4:5]
765 ; GFX1013-NEXT: flat_load_dword v1, v[2:3]
766 ; GFX1013-NEXT: v_mov_b32_e32 v2, 0
767 ; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0
768 ; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0
769 ; GFX1013-NEXT: v_mov_b32_e32 v5, 0x44004200
770 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
771 ; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16
772 ; GFX1013-NEXT: s_waitcnt vmcnt(0)
773 ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
774 ; GFX1013-NEXT: s_endpgm
776 ; GFX11-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
778 ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
779 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0
780 ; GFX11-NEXT: s_mov_b32 s8, 0x42004600
781 ; GFX11-NEXT: s_mov_b32 s9, 0x44004700
782 ; GFX11-NEXT: s_mov_b32 s10, 0x45004800
783 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
784 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
785 ; GFX11-NEXT: s_mov_b32 s1, 1.0
786 ; GFX11-NEXT: s_mov_b32 s0, 0
787 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
788 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
789 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
790 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
791 ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
792 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
793 ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
794 ; GFX11-NEXT: flat_load_b32 v6, v[0:1]
795 ; GFX11-NEXT: flat_load_b32 v7, v[2:3]
796 ; GFX11-NEXT: s_mov_b32 s2, 2.0
797 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8
798 ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
799 ; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v4, s9
800 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
801 ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[4:7] a16
802 ; GFX11-NEXT: s_waitcnt vmcnt(0)
803 ; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
804 ; GFX11-NEXT: s_endpgm
; Per-work-item inputs: element %lid of each pointer argument.
805 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
806 %gep_node_ptr = getelementptr inbounds i32, ptr %p_node_ptr, i32 %lid
807 %node_ptr = load i32, ptr %gep_node_ptr, align 4
808 %gep_ray = getelementptr inbounds float, ptr %p_ray, i32 %lid
809 %ray_extent = load float, ptr %gep_ray, align 4
; Constant ray vectors: origin stays f32, direction and inverse direction are
; half precision.
810 %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
811 %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
812 %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
813 %ray_dir0 = insertelement <3 x half> undef, half 3.0, i32 0
814 %ray_dir1 = insertelement <3 x half> %ray_dir0, half 4.0, i32 1
815 %ray_dir = insertelement <3 x half> %ray_dir1, half 5.0, i32 2
816 %ray_inv_dir0 = insertelement <3 x half> undef, half 6.0, i32 0
817 %ray_inv_dir1 = insertelement <3 x half> %ray_inv_dir0, half 7.0, i32 1
818 %ray_inv_dir = insertelement <3 x half> %ray_inv_dir1, half 8.0, i32 2
819 %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
; The result is stored through an undef pointer; the checks above only pin the
; store instruction that is emitted for it.
820 store <4 x i32> %v, ptr undef
; Kernel test for llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32 (64-bit node
; pointer, f32 ray direction and inverse direction).
;   - The node pointer is the constant i64 1111111111111, which equals
;     0x102_B36211C7; the checks expect it split into the two 32-bit
;     immediates 0xb36211c7 (low) and 0x102 (high).
;   - ray_extent is loaded per work-item from %p_ray, indexed by
;     workitem.id.x.
;   - ray_origin, ray_dir and ray_inv_dir are the constant vectors <0,1,2>,
;     <3,4,5> and <6,7,8> (0x40400000 = 3.0, 0x40a00000 = 5.0,
;     0x40c00000 = 6.0, 0x40e00000 = 7.0, 0x41000000 = 8.0).
; The kernel arguments are fetched with two scalar loads (offsets 0x24 and
; 0x34) grouped under s_clause.
; GFX1030 / GFX1013 expect the contiguous address range v[0:11]; GFX11 expects
; the bracketed multi-register address list.
; NOTE(review): "nsa_reassign" presumably refers to register reassignment for
; the non-sequential-address (NSA) encoding -- confirm against the pass that
; this test is meant to cover.
824 define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) {
825 ; GFX1030-LABEL: image_bvh64_intersect_ray_nsa_reassign:
827 ; GFX1030-NEXT: s_clause 0x1
828 ; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
829 ; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
830 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0
831 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0
832 ; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0
833 ; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0
834 ; GFX1030-NEXT: v_mov_b32_e32 v6, 0x40400000
835 ; GFX1030-NEXT: v_mov_b32_e32 v7, 4.0
836 ; GFX1030-NEXT: v_mov_b32_e32 v8, 0x40a00000
837 ; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40c00000
838 ; GFX1030-NEXT: v_mov_b32_e32 v10, 0x40e00000
839 ; GFX1030-NEXT: v_mov_b32_e32 v11, 0x41000000
840 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
841 ; GFX1030-NEXT: v_mov_b32_e32 v0, s4
842 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5
843 ; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
844 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
845 ; GFX1030-NEXT: flat_load_dword v2, v[0:1]
846 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c7
847 ; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102
848 ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
849 ; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
850 ; GFX1030-NEXT: s_waitcnt vmcnt(0)
851 ; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
852 ; GFX1030-NEXT: s_endpgm
854 ; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign:
856 ; GFX1013-NEXT: s_clause 0x1
857 ; GFX1013-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
858 ; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
859 ; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0
860 ; GFX1013-NEXT: v_mov_b32_e32 v3, 0
861 ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0
862 ; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0
863 ; GFX1013-NEXT: v_mov_b32_e32 v6, 0x40400000
864 ; GFX1013-NEXT: v_mov_b32_e32 v7, 4.0
865 ; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40a00000
866 ; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40c00000
867 ; GFX1013-NEXT: v_mov_b32_e32 v10, 0x40e00000
868 ; GFX1013-NEXT: v_mov_b32_e32 v11, 0x41000000
869 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
870 ; GFX1013-NEXT: v_mov_b32_e32 v0, s2
871 ; GFX1013-NEXT: v_mov_b32_e32 v1, s3
872 ; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
873 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
874 ; GFX1013-NEXT: flat_load_dword v2, v[0:1]
875 ; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c7
876 ; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102
877 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
878 ; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[4:7]
879 ; GFX1013-NEXT: s_waitcnt vmcnt(0)
880 ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
881 ; GFX1013-NEXT: s_endpgm
883 ; GFX11-LABEL: image_bvh64_intersect_ray_nsa_reassign:
885 ; GFX11-NEXT: s_clause 0x1
886 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
887 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34
888 ; GFX11-NEXT: s_mov_b32 s16, 0xb36211c7
889 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
890 ; GFX11-NEXT: s_movk_i32 s17, 0x102
891 ; GFX11-NEXT: s_mov_b32 s8, 0x40400000
892 ; GFX11-NEXT: s_mov_b32 s12, 0x40c00000
893 ; GFX11-NEXT: s_mov_b32 s6, 2.0
894 ; GFX11-NEXT: s_mov_b32 s10, 0x40a00000
895 ; GFX11-NEXT: s_mov_b32 s9, 4.0
896 ; GFX11-NEXT: s_mov_b32 s14, 0x41000000
897 ; GFX11-NEXT: s_mov_b32 s13, 0x40e00000
898 ; GFX11-NEXT: v_mov_b32_e32 v6, s12
899 ; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v9, s16
900 ; GFX11-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v4, s9
901 ; GFX11-NEXT: v_mov_b32_e32 v7, s13
902 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
903 ; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s4
904 ; GFX11-NEXT: v_mov_b32_e32 v1, s5
905 ; GFX11-NEXT: s_mov_b32 s4, 0
906 ; GFX11-NEXT: s_mov_b32 s5, 1.0
907 ; GFX11-NEXT: v_mov_b32_e32 v10, s17
908 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
909 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
910 ; GFX11-NEXT: flat_load_b32 v11, v[0:1]
911 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
912 ; GFX11-NEXT: v_mov_b32_e32 v2, s6
913 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
914 ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[0:2], v[3:5], v[6:8]], s[0:3]
915 ; GFX11-NEXT: s_waitcnt vmcnt(0)
916 ; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
917 ; GFX11-NEXT: s_endpgm
; Per-work-item input: element %lid of %p_ray.
918 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
919 %gep_ray = getelementptr inbounds float, ptr %p_ray, i32 %lid
920 %ray_extent = load float, ptr %gep_ray, align 4
; Constant ray vectors, built one lane at a time on top of undef.
921 %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
922 %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
923 %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
924 %ray_dir0 = insertelement <3 x float> undef, float 3.0, i32 0
925 %ray_dir1 = insertelement <3 x float> %ray_dir0, float 4.0, i32 1
926 %ray_dir = insertelement <3 x float> %ray_dir1, float 5.0, i32 2
927 %ray_inv_dir0 = insertelement <3 x float> undef, float 6.0, i32 0
928 %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float 7.0, i32 1
929 %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float 8.0, i32 2
930 %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 1111111111111, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
; The result is stored through an undef pointer; the checks above only pin the
; store instruction that is emitted for it.
931 store <4 x i32> %v, ptr undef
; Kernel test for llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16 (64-bit node
; pointer, half-precision ray direction and inverse direction; the expected
; instruction carries the a16 modifier).
;   - The node pointer is the constant i64 1111111111110, which equals
;     0x102_B36211C6; the checks expect it split into the two 32-bit
;     immediates 0xb36211c6 (low) and 0x102 (high).
;   - ray_extent is loaded per work-item from %p_ray, indexed by
;     workitem.id.x.
;   - ray_origin is <0,1,2> (f32); ray_dir is <3,4,5> and ray_inv_dir is
;     <6,7,8> (f16: 0x4200, 0x4400, 0x4500, 0x4600, 0x4700, 0x4800).
; The expected packing of the six halves into three dwords differs by target:
;   - GFX1030 / GFX1013 checks: 0x44004200, 0x46004500, 0x48004700, i.e. the
;     halves in order dir.x, dir.y, dir.z, inv.x, inv.y, inv.z, two per dword
;     with the earlier one in the low 16 bits.
;   - GFX11 checks: 0x42004600, 0x44004700, 0x45004800, i.e. per component,
;     dir in the high 16 bits and inv_dir in the low 16 bits.
; GFX1030 / GFX1013 expect the contiguous address range v[0:8]; GFX11 expects
; the bracketed multi-register address list.
; NOTE(review): "nsa_reassign" presumably refers to register reassignment for
; the non-sequential-address (NSA) encoding -- confirm against the pass that
; this test is meant to cover.
935 define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) {
936 ; GFX1030-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
938 ; GFX1030-NEXT: s_clause 0x1
939 ; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
940 ; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
941 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0
942 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0
943 ; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0
944 ; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0
945 ; GFX1030-NEXT: v_mov_b32_e32 v6, 0x44004200
946 ; GFX1030-NEXT: v_mov_b32_e32 v7, 0x46004500
947 ; GFX1030-NEXT: v_mov_b32_e32 v8, 0x48004700
948 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
949 ; GFX1030-NEXT: v_mov_b32_e32 v0, s4
950 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5
951 ; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
952 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
953 ; GFX1030-NEXT: flat_load_dword v2, v[0:1]
954 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c6
955 ; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102
956 ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
957 ; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16
958 ; GFX1030-NEXT: s_waitcnt vmcnt(0)
959 ; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
960 ; GFX1030-NEXT: s_endpgm
962 ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
964 ; GFX1013-NEXT: s_clause 0x1
965 ; GFX1013-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
966 ; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
967 ; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0
968 ; GFX1013-NEXT: v_mov_b32_e32 v3, 0
969 ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0
970 ; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0
971 ; GFX1013-NEXT: v_mov_b32_e32 v6, 0x44004200
972 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x46004500
973 ; GFX1013-NEXT: v_mov_b32_e32 v8, 0x48004700
974 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
975 ; GFX1013-NEXT: v_mov_b32_e32 v0, s2
976 ; GFX1013-NEXT: v_mov_b32_e32 v1, s3
977 ; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
978 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
979 ; GFX1013-NEXT: flat_load_dword v2, v[0:1]
980 ; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c6
981 ; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102
982 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
983 ; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[4:7] a16
984 ; GFX1013-NEXT: s_waitcnt vmcnt(0)
985 ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
986 ; GFX1013-NEXT: s_endpgm
988 ; GFX11-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
990 ; GFX11-NEXT: s_clause 0x1
991 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
992 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34
993 ; GFX11-NEXT: s_mov_b32 s12, 0xb36211c6
994 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
995 ; GFX11-NEXT: s_movk_i32 s13, 0x102
996 ; GFX11-NEXT: s_mov_b32 s6, 2.0
997 ; GFX11-NEXT: s_mov_b32 s8, 0x42004600
998 ; GFX11-NEXT: s_mov_b32 s9, 0x44004700
999 ; GFX11-NEXT: s_mov_b32 s10, 0x45004800
1000 ; GFX11-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v4, s9
1001 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1002 ; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s4
1003 ; GFX11-NEXT: v_mov_b32_e32 v1, s5
1004 ; GFX11-NEXT: s_mov_b32 s5, 1.0
1005 ; GFX11-NEXT: s_mov_b32 s4, 0
1006 ; GFX11-NEXT: v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v7, s13
1007 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
1008 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1009 ; GFX11-NEXT: flat_load_b32 v8, v[0:1]
1010 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
1011 ; GFX11-NEXT: v_mov_b32_e32 v2, s6
1012 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1013 ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[0:3] a16
1014 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1015 ; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
1016 ; GFX11-NEXT: s_endpgm
; Per-work-item input: element %lid of %p_ray.
1017 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
1018 %gep_ray = getelementptr inbounds float, ptr %p_ray, i32 %lid
1019 %ray_extent = load float, ptr %gep_ray, align 4
; Constant ray vectors: origin stays f32, direction and inverse direction are
; half precision.
1020 %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
1021 %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
1022 %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
1023 %ray_dir0 = insertelement <3 x half> undef, half 3.0, i32 0
1024 %ray_dir1 = insertelement <3 x half> %ray_dir0, half 4.0, i32 1
1025 %ray_dir = insertelement <3 x half> %ray_dir1, half 5.0, i32 2
1026 %ray_inv_dir0 = insertelement <3 x half> undef, half 6.0, i32 0
1027 %ray_inv_dir1 = insertelement <3 x half> %ray_inv_dir0, half 7.0, i32 1
1028 %ray_inv_dir = insertelement <3 x half> %ray_inv_dir1, half 8.0, i32 2
1029 %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 1111111111110, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
; The result is stored through an undef pointer; the checks above only pin the
; store instruction that is emitted for it.
1030 store <4 x i32> %v, ptr undef