llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1030 %s
   3 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1013 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1013 %s
   4 ; RUN: not --crash llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s -o /dev/null 2>&1 | FileCheck -check-prefix=ERR %s
   5
   6 ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr)
   7 ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr)
   8 ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(ulong node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr)
   9 ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(ulong node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr)
  10
  11 declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <3 x float>, <3 x float>, <3 x float>, <4 x i32>)
  12 declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <3 x float>, <3 x half>, <3 x half>, <4 x i32>)
  13 declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <3 x float>, <3 x float>, <3 x float>, <4 x i32>)
  14 declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <3 x float>, <3 x half>, <3 x half>, <4 x i32>)
  15 declare i32 @llvm.amdgcn.workitem.id.x()
  16
  17 define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) {
     ; Baseline case: i32 node pointer, <3 x float> ray direction and inverse
     ; direction, and the texture descriptor passed inreg. The shared GCN checks
     ; (used by both the gfx1030 and gfx1013 run lines) expect one
     ; image_bvh_intersect_ray with a v[0:15] address and the descriptor in
     ; s[0:3], followed by s_waitcnt vmcnt(0). The ERR check belongs to the
     ; gfx1012 run line, which is expected to crash with
     ; "intrinsic not supported on subtarget".
  18 ; GCN-LABEL: image_bvh_intersect_ray:
  19 ; GCN:       ; %bb.0:
  20 ; GCN-NEXT:    image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
  21 ; GCN-NEXT:    s_waitcnt vmcnt(0)
  22 ; GCN-NEXT:    ; return to shader part epilog
  23 ; ERR: in function image_bvh_intersect_ray{{.*}}intrinsic not supported on subtarget
  24   %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
     ; The intrinsic yields <4 x i32>; bitcast it to the function's
     ; <4 x float> return type.
  25   %r = bitcast <4 x i32> %v to <4 x float>
  26   ret <4 x float> %r
  27 }
  28
  29 define amdgpu_ps <4 x float> @image_bvh_intersect_ray_flat(i32 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {
     ; Same i32/v4f32 intrinsic call as the vector-argument form, but the three
     ; <3 x float> ray vectors are assembled from individual scalar float
     ; arguments with insertelement. The shared GCN checks expect a single
     ; image_bvh_intersect_ray v[0:3], v[0:15], s[0:3] with no extra
     ; instructions before it.
  30 ; GCN-LABEL: image_bvh_intersect_ray_flat:
  31 ; GCN:       ; %bb.0:
  32 ; GCN-NEXT:    image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
  33 ; GCN-NEXT:    s_waitcnt vmcnt(0)
  34 ; GCN-NEXT:    ; return to shader part epilog
     ; Build ray_origin, ray_dir and ray_inv_dir lane by lane (x, y, z).
  35   %ray_origin0 = insertelement <3 x float> undef, float %ray_origin_x, i32 0
  36   %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1
  37   %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2
  38   %ray_dir0 = insertelement <3 x float> undef, float %ray_dir_x, i32 0
  39   %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1
  40   %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2
  41   %ray_inv_dir0 = insertelement <3 x float> undef, float %ray_inv_dir_x, i32 0
  42   %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
  43   %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
  44   %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
     ; The intrinsic yields <4 x i32>; bitcast it to the function's
     ; <4 x float> return type.
  45   %r = bitcast <4 x i32> %v to <4 x float>
  46   ret <4 x float> %r
  47 }
  48
  49 define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) {
     ; v4f16 variant: ray_dir and ray_inv_dir are <3 x half>. The shared GCN
     ; checks expect the incoming values to be repacked into v5, v6 and v7
     ; (0xffff mask in s4, 16-bit shifts, v_and_or_b32, v_alignbit_b32) before
     ; an image_bvh_intersect_ray that carries the a16 modifier and uses a
     ; v[0:7] address.
  50 ; GCN-LABEL: image_bvh_intersect_ray_a16:
  51 ; GCN:       ; %bb.0:
  52 ; GCN-NEXT:    s_mov_b32 s4, 0xffff
  53 ; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
  54 ; GCN-NEXT:    v_and_b32_e32 v10, s4, v7
  55 ; GCN-NEXT:    v_and_b32_e32 v8, s4, v8
  56 ; GCN-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
  57 ; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
  58 ; GCN-NEXT:    v_alignbit_b32 v7, v8, v7, 16
  59 ; GCN-NEXT:    v_and_or_b32 v5, v5, s4, v9
  60 ; GCN-NEXT:    v_and_or_b32 v6, v6, s4, v10
  61 ; GCN-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16
  62 ; GCN-NEXT:    s_waitcnt vmcnt(0)
  63 ; GCN-NEXT:    ; return to shader part epilog
  64   %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
     ; The intrinsic yields <4 x i32>; bitcast it to the function's
     ; <4 x float> return type.
  65   %r = bitcast <4 x i32> %v to <4 x float>
  66   ret <4 x float> %r
  67 }
  68
  69 define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) {
     ; i64 node pointer variant with <3 x float> direction vectors and the
     ; descriptor passed inreg. The shared GCN checks expect a single
     ; image_bvh64_intersect_ray with a v[0:15] address and the descriptor in
     ; s[0:3].
  70 ; GCN-LABEL: image_bvh64_intersect_ray:
  71 ; GCN:       ; %bb.0:
  72 ; GCN-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
  73 ; GCN-NEXT:    s_waitcnt vmcnt(0)
  74 ; GCN-NEXT:    ; return to shader part epilog
  75   %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
     ; The intrinsic yields <4 x i32>; bitcast it to the function's
     ; <4 x float> return type.
  76   %r = bitcast <4 x i32> %v to <4 x float>
  77   ret <4 x float> %r
  78 }
  79
  80 define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_flat(<2 x i32> %node_ptr_vec, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {
     ; Same i64/v4f32 intrinsic call as the vector-argument form, but with every
     ; operand supplied piecewise: the node pointer arrives as <2 x i32> and is
     ; bitcast to i64, and the three <3 x float> ray vectors are assembled from
     ; scalar float arguments. The shared GCN checks expect a single
     ; image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] with no extra
     ; instructions before it.
  81 ; GCN-LABEL: image_bvh64_intersect_ray_flat:
  82 ; GCN:       ; %bb.0:
  83 ; GCN-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
  84 ; GCN-NEXT:    s_waitcnt vmcnt(0)
  85 ; GCN-NEXT:    ; return to shader part epilog
  86   %node_ptr = bitcast <2 x i32> %node_ptr_vec to i64
     ; Build ray_origin, ray_dir and ray_inv_dir lane by lane (x, y, z).
  87   %ray_origin0 = insertelement <3 x float> undef, float %ray_origin_x, i32 0
  88   %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1
  89   %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2
  90   %ray_dir0 = insertelement <3 x float> undef, float %ray_dir_x, i32 0
  91   %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1
  92   %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2
  93   %ray_inv_dir0 = insertelement <3 x float> undef, float %ray_inv_dir_x, i32 0
  94   %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
  95   %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
  96   %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
     ; The intrinsic yields <4 x i32>; bitcast it to the function's
     ; <4 x float> return type.
  97   %r = bitcast <4 x i32> %v to <4 x float>
  98   ret <4 x float> %r
  99 }
 100
 101 define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) {
     ; i64 node pointer combined with <3 x half> ray_dir / ray_inv_dir. The
     ; shared GCN checks expect the incoming values to be repacked into v6, v7
     ; and v8 (0xffff mask in s4, 16-bit shifts, v_and_or_b32, v_alignbit_b32)
     ; before an image_bvh64_intersect_ray that carries the a16 modifier and
     ; uses a v[0:15] address.
 102 ; GCN-LABEL: image_bvh64_intersect_ray_a16:
 103 ; GCN:       ; %bb.0:
 104 ; GCN-NEXT:    s_mov_b32 s4, 0xffff
 105 ; GCN-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
 106 ; GCN-NEXT:    v_and_b32_e32 v11, s4, v8
 107 ; GCN-NEXT:    v_and_b32_e32 v9, s4, v9
 108 ; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 109 ; GCN-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
 110 ; GCN-NEXT:    v_alignbit_b32 v8, v9, v8, 16
 111 ; GCN-NEXT:    v_and_or_b32 v6, v6, s4, v10
 112 ; GCN-NEXT:    v_and_or_b32 v7, v7, s4, v11
 113 ; GCN-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
 114 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 115 ; GCN-NEXT:    ; return to shader part epilog
 116   %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
     ; The intrinsic yields <4 x i32>; bitcast it to the function's
     ; <4 x float> return type.
 117   %r = bitcast <4 x i32> %v to <4 x float>
 118   ret <4 x float> %r
 119 }
 120
 121 define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) {
     ; Same i32/v4f32 call as the baseline test, but %tdescr is NOT inreg, so
     ; the descriptor arrives in VGPRs. Both targets are expected to wrap the
     ; image instruction in a loop (.LBB6_1): v_readfirstlane_b32 copies the
     ; four descriptor dwords into s[4:7], two v_cmp_eq_u64 compare them with
     ; the per-lane values, s_and_saveexec_b32 restricts exec to the matching
     ; lanes, and s_xor_b32 / s_cbranch_execnz repeat while lanes remain. The
     ; saved exec mask (s1) is restored after the loop.
     ;
     ; The two targets have separate checks because the expected register
     ; assignment differs: gfx1030 first copies the address operands to
     ; v15-v25 and writes the result to v[0:3]; gfx1013 copies the descriptor
     ; to v16-v19, uses a v[0:15] address, writes the result to v[20:23] and
     ; moves it back to v0-v3, and has an s_waitcnt_depctr 0xffe3 inside the
     ; loop.
 122 ; GFX1030-LABEL: image_bvh_intersect_ray_vgpr_descr:
 123 ; GFX1030:       ; %bb.0:
 124 ; GFX1030-NEXT:    v_mov_b32_e32 v15, v0
 125 ; GFX1030-NEXT:    v_mov_b32_e32 v16, v1
 126 ; GFX1030-NEXT:    v_mov_b32_e32 v17, v2
 127 ; GFX1030-NEXT:    v_mov_b32_e32 v18, v3
 128 ; GFX1030-NEXT:    v_mov_b32_e32 v19, v4
 129 ; GFX1030-NEXT:    v_mov_b32_e32 v20, v5
 130 ; GFX1030-NEXT:    v_mov_b32_e32 v21, v6
 131 ; GFX1030-NEXT:    v_mov_b32_e32 v22, v7
 132 ; GFX1030-NEXT:    v_mov_b32_e32 v23, v8
 133 ; GFX1030-NEXT:    v_mov_b32_e32 v24, v9
 134 ; GFX1030-NEXT:    v_mov_b32_e32 v25, v10
 135 ; GFX1030-NEXT:    s_mov_b32 s1, exec_lo
 136 ; GFX1030-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 137 ; GFX1030-NEXT:    v_readfirstlane_b32 s4, v11
 138 ; GFX1030-NEXT:    v_readfirstlane_b32 s5, v12
 139 ; GFX1030-NEXT:    v_readfirstlane_b32 s6, v13
 140 ; GFX1030-NEXT:    v_readfirstlane_b32 s7, v14
 141 ; GFX1030-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12]
 142 ; GFX1030-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
 143 ; GFX1030-NEXT:    s_and_b32 s0, s0, vcc_lo
 144 ; GFX1030-NEXT:    s_and_saveexec_b32 s0, s0
 145 ; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[15:30], s[4:7]
 146 ; GFX1030-NEXT:    ; implicit-def: $vgpr11_vgpr12
 147 ; GFX1030-NEXT:    ; implicit-def: $vgpr15
 148 ; GFX1030-NEXT:    ; implicit-def: $vgpr16
 149 ; GFX1030-NEXT:    ; implicit-def: $vgpr17
 150 ; GFX1030-NEXT:    ; implicit-def: $vgpr18
 151 ; GFX1030-NEXT:    ; implicit-def: $vgpr19
 152 ; GFX1030-NEXT:    ; implicit-def: $vgpr20
 153 ; GFX1030-NEXT:    ; implicit-def: $vgpr21
 154 ; GFX1030-NEXT:    ; implicit-def: $vgpr22
 155 ; GFX1030-NEXT:    ; implicit-def: $vgpr23
 156 ; GFX1030-NEXT:    ; implicit-def: $vgpr24
 157 ; GFX1030-NEXT:    ; implicit-def: $vgpr25
 158 ; GFX1030-NEXT:    ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14
 159 ; GFX1030-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 160 ; GFX1030-NEXT:    s_cbranch_execnz .LBB6_1
 161 ; GFX1030-NEXT:  ; %bb.2:
 162 ; GFX1030-NEXT:    s_mov_b32 exec_lo, s1
 163 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
 164 ; GFX1030-NEXT:    ; return to shader part epilog
 165 ;
 166 ; GFX1013-LABEL: image_bvh_intersect_ray_vgpr_descr:
 167 ; GFX1013:       ; %bb.0:
 168 ; GFX1013-NEXT:    v_mov_b32_e32 v16, v11
 169 ; GFX1013-NEXT:    v_mov_b32_e32 v17, v12
 170 ; GFX1013-NEXT:    v_mov_b32_e32 v18, v13
 171 ; GFX1013-NEXT:    v_mov_b32_e32 v19, v14
 172 ; GFX1013-NEXT:    s_mov_b32 s1, exec_lo
 173 ; GFX1013-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 174 ; GFX1013-NEXT:    v_readfirstlane_b32 s4, v16
 175 ; GFX1013-NEXT:    v_readfirstlane_b32 s5, v17
 176 ; GFX1013-NEXT:    v_readfirstlane_b32 s6, v18
 177 ; GFX1013-NEXT:    v_readfirstlane_b32 s7, v19
 178 ; GFX1013-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[16:17]
 179 ; GFX1013-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[18:19]
 180 ; GFX1013-NEXT:    s_and_b32 s0, s0, vcc_lo
 181 ; GFX1013-NEXT:    s_and_saveexec_b32 s0, s0
 182 ; GFX1013-NEXT:    image_bvh_intersect_ray v[20:23], v[0:15], s[4:7]
 183 ; GFX1013-NEXT:    ; implicit-def: $vgpr16_vgpr17
 184 ; GFX1013-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 185 ; GFX1013-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19
 186 ; GFX1013-NEXT:    s_waitcnt_depctr 0xffe3
 187 ; GFX1013-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 188 ; GFX1013-NEXT:    s_cbranch_execnz .LBB6_1
 189 ; GFX1013-NEXT:  ; %bb.2:
 190 ; GFX1013-NEXT:    s_mov_b32 exec_lo, s1
 191 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
 192 ; GFX1013-NEXT:    v_mov_b32_e32 v0, v20
 193 ; GFX1013-NEXT:    v_mov_b32_e32 v1, v21
 194 ; GFX1013-NEXT:    v_mov_b32_e32 v2, v22
 195 ; GFX1013-NEXT:    v_mov_b32_e32 v3, v23
 196 ; GFX1013-NEXT:    ; return to shader part epilog
 197   %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
     ; The intrinsic yields <4 x i32>; bitcast it to the function's
     ; <4 x float> return type.
 198   %r = bitcast <4 x i32> %v to <4 x float>
 199   ret <4 x float> %r
 200 }
 201
 202 define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) {
     ; i32/v4f16 call with a non-inreg (VGPR) descriptor. The checks expect the
     ; half-operand repacking (0xffff mask, 16-bit shifts, v_and_or_b32,
     ; v_alignbit_b32) to happen once before the loop, followed by the
     ; readfirstlane loop (.LBB7_1): v_readfirstlane_b32 into s[4:7], two
     ; v_cmp_eq_u64 against v[9:12], s_and_saveexec_b32, the a16 image
     ; instruction, then s_xor_b32 / s_cbranch_execnz while lanes remain.
     ;
     ; gfx1030 is expected to move the address operands to v[13:20] and write
     ; the result to v[0:3]; gfx1013 keeps the address in v[0:7], writes the
     ; result to v[13:16] and moves it back to v0-v3, and has an
     ; s_waitcnt_depctr 0xffe3 inside the loop.
 203 ; GFX1030-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
 204 ; GFX1030:       ; %bb.0:
 205 ; GFX1030-NEXT:    s_mov_b32 s0, 0xffff
 206 ; GFX1030-NEXT:    v_mov_b32_e32 v13, v0
 207 ; GFX1030-NEXT:    v_mov_b32_e32 v14, v1
 208 ; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
 209 ; GFX1030-NEXT:    v_and_b32_e32 v1, s0, v7
 210 ; GFX1030-NEXT:    v_mov_b32_e32 v15, v2
 211 ; GFX1030-NEXT:    v_and_b32_e32 v2, s0, v8
 212 ; GFX1030-NEXT:    v_mov_b32_e32 v16, v3
 213 ; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 214 ; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 215 ; GFX1030-NEXT:    v_mov_b32_e32 v17, v4
 216 ; GFX1030-NEXT:    v_alignbit_b32 v20, v2, v7, 16
 217 ; GFX1030-NEXT:    s_mov_b32 s1, exec_lo
 218 ; GFX1030-NEXT:    v_and_or_b32 v18, v5, s0, v0
 219 ; GFX1030-NEXT:    v_and_or_b32 v19, v6, s0, v1
 220 ; GFX1030-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 221 ; GFX1030-NEXT:    v_readfirstlane_b32 s4, v9
 222 ; GFX1030-NEXT:    v_readfirstlane_b32 s5, v10
 223 ; GFX1030-NEXT:    v_readfirstlane_b32 s6, v11
 224 ; GFX1030-NEXT:    v_readfirstlane_b32 s7, v12
 225 ; GFX1030-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
 226 ; GFX1030-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[11:12]
 227 ; GFX1030-NEXT:    s_and_b32 s0, s0, vcc_lo
 228 ; GFX1030-NEXT:    s_and_saveexec_b32 s0, s0
 229 ; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[13:20], s[4:7] a16
 230 ; GFX1030-NEXT:    ; implicit-def: $vgpr9_vgpr10
 231 ; GFX1030-NEXT:    ; implicit-def: $vgpr13
 232 ; GFX1030-NEXT:    ; implicit-def: $vgpr14
 233 ; GFX1030-NEXT:    ; implicit-def: $vgpr15
 234 ; GFX1030-NEXT:    ; implicit-def: $vgpr16
 235 ; GFX1030-NEXT:    ; implicit-def: $vgpr17
 236 ; GFX1030-NEXT:    ; implicit-def: $vgpr18
 237 ; GFX1030-NEXT:    ; implicit-def: $vgpr19
 238 ; GFX1030-NEXT:    ; implicit-def: $vgpr20
 239 ; GFX1030-NEXT:    ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12
 240 ; GFX1030-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 241 ; GFX1030-NEXT:    s_cbranch_execnz .LBB7_1
 242 ; GFX1030-NEXT:  ; %bb.2:
 243 ; GFX1030-NEXT:    s_mov_b32 exec_lo, s1
 244 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
 245 ; GFX1030-NEXT:    ; return to shader part epilog
 246 ;
 247 ; GFX1013-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
 248 ; GFX1013:       ; %bb.0:
 249 ; GFX1013-NEXT:    s_mov_b32 s0, 0xffff
 250 ; GFX1013-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
 251 ; GFX1013-NEXT:    v_and_b32_e32 v14, s0, v7
 252 ; GFX1013-NEXT:    v_and_b32_e32 v8, s0, v8
 253 ; GFX1013-NEXT:    s_mov_b32 s1, exec_lo
 254 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 255 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
 256 ; GFX1013-NEXT:    v_alignbit_b32 v7, v8, v7, 16
 257 ; GFX1013-NEXT:    v_and_or_b32 v5, v5, s0, v13
 258 ; GFX1013-NEXT:    v_and_or_b32 v6, v6, s0, v14
 259 ; GFX1013-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 260 ; GFX1013-NEXT:    v_readfirstlane_b32 s4, v9
 261 ; GFX1013-NEXT:    v_readfirstlane_b32 s5, v10
 262 ; GFX1013-NEXT:    v_readfirstlane_b32 s6, v11
 263 ; GFX1013-NEXT:    v_readfirstlane_b32 s7, v12
 264 ; GFX1013-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
 265 ; GFX1013-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[11:12]
 266 ; GFX1013-NEXT:    s_and_b32 s0, s0, vcc_lo
 267 ; GFX1013-NEXT:    s_and_saveexec_b32 s0, s0
 268 ; GFX1013-NEXT:    image_bvh_intersect_ray v[13:16], v[0:7], s[4:7] a16
 269 ; GFX1013-NEXT:    ; implicit-def: $vgpr9_vgpr10
 270 ; GFX1013-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
 271 ; GFX1013-NEXT:    ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12
 272 ; GFX1013-NEXT:    s_waitcnt_depctr 0xffe3
 273 ; GFX1013-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 274 ; GFX1013-NEXT:    s_cbranch_execnz .LBB7_1
 275 ; GFX1013-NEXT:  ; %bb.2:
 276 ; GFX1013-NEXT:    s_mov_b32 exec_lo, s1
 277 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
 278 ; GFX1013-NEXT:    v_mov_b32_e32 v0, v13
 279 ; GFX1013-NEXT:    v_mov_b32_e32 v1, v14
 280 ; GFX1013-NEXT:    v_mov_b32_e32 v2, v15
 281 ; GFX1013-NEXT:    v_mov_b32_e32 v3, v16
 282 ; GFX1013-NEXT:    ; return to shader part epilog
 283   %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
     ; The intrinsic yields <4 x i32>; bitcast it to the function's
     ; <4 x float> return type.
 284   %r = bitcast <4 x i32> %v to <4 x float>
 285   ret <4 x float> %r
 286 }
 287
 288 define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) {
     ; i64/v4f32 call with a non-inreg (VGPR) descriptor. Both targets are
     ; expected to emit the readfirstlane loop (.LBB8_1): v_readfirstlane_b32
     ; of the four descriptor dwords into s[4:7], two v_cmp_eq_u64 against the
     ; per-lane values, s_and_saveexec_b32, image_bvh64_intersect_ray, then
     ; s_xor_b32 / s_cbranch_execnz while lanes remain.
     ;
     ; gfx1030 is expected to copy the address operands to v16-v27 and write
     ; the result to v[0:3]; gfx1013 copies the descriptor to v16-v19, uses a
     ; v[0:15] address, writes the result to v[20:23] and moves it back to
     ; v0-v3, and has an s_waitcnt_depctr 0xffe3 inside the loop.
 289 ; GFX1030-LABEL: image_bvh64_intersect_ray_vgpr_descr:
 290 ; GFX1030:       ; %bb.0:
 291 ; GFX1030-NEXT:    v_mov_b32_e32 v16, v0
 292 ; GFX1030-NEXT:    v_mov_b32_e32 v17, v1
 293 ; GFX1030-NEXT:    v_mov_b32_e32 v18, v2
 294 ; GFX1030-NEXT:    v_mov_b32_e32 v19, v3
 295 ; GFX1030-NEXT:    v_mov_b32_e32 v20, v4
 296 ; GFX1030-NEXT:    v_mov_b32_e32 v21, v5
 297 ; GFX1030-NEXT:    v_mov_b32_e32 v22, v6
 298 ; GFX1030-NEXT:    v_mov_b32_e32 v23, v7
 299 ; GFX1030-NEXT:    v_mov_b32_e32 v24, v8
 300 ; GFX1030-NEXT:    v_mov_b32_e32 v25, v9
 301 ; GFX1030-NEXT:    v_mov_b32_e32 v26, v10
 302 ; GFX1030-NEXT:    v_mov_b32_e32 v27, v11
 303 ; GFX1030-NEXT:    s_mov_b32 s1, exec_lo
 304 ; GFX1030-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 305 ; GFX1030-NEXT:    v_readfirstlane_b32 s4, v12
 306 ; GFX1030-NEXT:    v_readfirstlane_b32 s5, v13
 307 ; GFX1030-NEXT:    v_readfirstlane_b32 s6, v14
 308 ; GFX1030-NEXT:    v_readfirstlane_b32 s7, v15
 309 ; GFX1030-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[12:13]
 310 ; GFX1030-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[14:15]
 311 ; GFX1030-NEXT:    s_and_b32 s0, s0, vcc_lo
 312 ; GFX1030-NEXT:    s_and_saveexec_b32 s0, s0
 313 ; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], v[16:31], s[4:7]
 314 ; GFX1030-NEXT:    ; implicit-def: $vgpr12_vgpr13
 315 ; GFX1030-NEXT:    ; implicit-def: $vgpr16
 316 ; GFX1030-NEXT:    ; implicit-def: $vgpr17
 317 ; GFX1030-NEXT:    ; implicit-def: $vgpr18
 318 ; GFX1030-NEXT:    ; implicit-def: $vgpr19
 319 ; GFX1030-NEXT:    ; implicit-def: $vgpr20
 320 ; GFX1030-NEXT:    ; implicit-def: $vgpr21
 321 ; GFX1030-NEXT:    ; implicit-def: $vgpr22
 322 ; GFX1030-NEXT:    ; implicit-def: $vgpr23
 323 ; GFX1030-NEXT:    ; implicit-def: $vgpr24
 324 ; GFX1030-NEXT:    ; implicit-def: $vgpr25
 325 ; GFX1030-NEXT:    ; implicit-def: $vgpr26
 326 ; GFX1030-NEXT:    ; implicit-def: $vgpr27
 327 ; GFX1030-NEXT:    ; implicit-def: $vgpr12_vgpr13_vgpr14_vgpr15
 328 ; GFX1030-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 329 ; GFX1030-NEXT:    s_cbranch_execnz .LBB8_1
 330 ; GFX1030-NEXT:  ; %bb.2:
 331 ; GFX1030-NEXT:    s_mov_b32 exec_lo, s1
 332 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
 333 ; GFX1030-NEXT:    ; return to shader part epilog
 334 ;
 335 ; GFX1013-LABEL: image_bvh64_intersect_ray_vgpr_descr:
 336 ; GFX1013:       ; %bb.0:
 337 ; GFX1013-NEXT:    v_mov_b32_e32 v16, v12
 338 ; GFX1013-NEXT:    v_mov_b32_e32 v17, v13
 339 ; GFX1013-NEXT:    v_mov_b32_e32 v18, v14
 340 ; GFX1013-NEXT:    v_mov_b32_e32 v19, v15
 341 ; GFX1013-NEXT:    s_mov_b32 s1, exec_lo
 342 ; GFX1013-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 343 ; GFX1013-NEXT:    v_readfirstlane_b32 s4, v16
 344 ; GFX1013-NEXT:    v_readfirstlane_b32 s5, v17
 345 ; GFX1013-NEXT:    v_readfirstlane_b32 s6, v18
 346 ; GFX1013-NEXT:    v_readfirstlane_b32 s7, v19
 347 ; GFX1013-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[16:17]
 348 ; GFX1013-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[18:19]
 349 ; GFX1013-NEXT:    s_and_b32 s0, s0, vcc_lo
 350 ; GFX1013-NEXT:    s_and_saveexec_b32 s0, s0
 351 ; GFX1013-NEXT:    image_bvh64_intersect_ray v[20:23], v[0:15], s[4:7]
 352 ; GFX1013-NEXT:    ; implicit-def: $vgpr16_vgpr17
 353 ; GFX1013-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 354 ; GFX1013-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19
 355 ; GFX1013-NEXT:    s_waitcnt_depctr 0xffe3
 356 ; GFX1013-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 357 ; GFX1013-NEXT:    s_cbranch_execnz .LBB8_1
 358 ; GFX1013-NEXT:  ; %bb.2:
 359 ; GFX1013-NEXT:    s_mov_b32 exec_lo, s1
 360 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
 361 ; GFX1013-NEXT:    v_mov_b32_e32 v0, v20
 362 ; GFX1013-NEXT:    v_mov_b32_e32 v1, v21
 363 ; GFX1013-NEXT:    v_mov_b32_e32 v2, v22
 364 ; GFX1013-NEXT:    v_mov_b32_e32 v3, v23
 365 ; GFX1013-NEXT:    ; return to shader part epilog
 366   %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
     ; The intrinsic yields <4 x i32>; bitcast it to the function's
     ; <4 x float> return type.
 367   %r = bitcast <4 x i32> %v to <4 x float>
 368   ret <4 x float> %r
 369 }
 370
 371 define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) {
     ; i64/v4f16 call with a non-inreg (VGPR) descriptor. The checks expect the
     ; half-operand repacking (0xffff mask, 16-bit shifts, v_and_or_b32,
     ; v_alignbit_b32) once before the loop, followed by the readfirstlane loop
     ; (.LBB9_1): v_readfirstlane_b32 into s[4:7], two v_cmp_eq_u64 against the
     ; per-lane descriptor values, s_and_saveexec_b32, the a16
     ; image_bvh64_intersect_ray, then s_xor_b32 / s_cbranch_execnz while
     ; lanes remain.
     ;
     ; gfx1030 is expected to use a v[14:29] address and write the result to
     ; v[0:3]; gfx1013 copies the descriptor to v16-v19, uses a v[0:15]
     ; address, writes the result to v[20:23] and moves it back to v0-v3, and
     ; has an s_waitcnt_depctr 0xffe3 inside the loop.
 372 ; GFX1030-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
 373 ; GFX1030:       ; %bb.0:
 374 ; GFX1030-NEXT:    s_mov_b32 s0, 0xffff
 375 ; GFX1030-NEXT:    v_mov_b32_e32 v14, v0
 376 ; GFX1030-NEXT:    v_mov_b32_e32 v15, v1
 377 ; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
 378 ; GFX1030-NEXT:    v_and_b32_e32 v1, s0, v8
 379 ; GFX1030-NEXT:    v_mov_b32_e32 v16, v2
 380 ; GFX1030-NEXT:    v_and_b32_e32 v2, s0, v9
 381 ; GFX1030-NEXT:    v_mov_b32_e32 v17, v3
 382 ; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 383 ; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 384 ; GFX1030-NEXT:    v_mov_b32_e32 v18, v4
 385 ; GFX1030-NEXT:    v_mov_b32_e32 v19, v5
 386 ; GFX1030-NEXT:    v_alignbit_b32 v22, v2, v8, 16
 387 ; GFX1030-NEXT:    v_and_or_b32 v20, v6, s0, v0
 388 ; GFX1030-NEXT:    v_and_or_b32 v21, v7, s0, v1
 389 ; GFX1030-NEXT:    s_mov_b32 s1, exec_lo
 390 ; GFX1030-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 391 ; GFX1030-NEXT:    v_readfirstlane_b32 s4, v10
 392 ; GFX1030-NEXT:    v_readfirstlane_b32 s5, v11
 393 ; GFX1030-NEXT:    v_readfirstlane_b32 s6, v12
 394 ; GFX1030-NEXT:    v_readfirstlane_b32 s7, v13
 395 ; GFX1030-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11]
 396 ; GFX1030-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
 397 ; GFX1030-NEXT:    s_and_b32 s0, s0, vcc_lo
 398 ; GFX1030-NEXT:    s_and_saveexec_b32 s0, s0
 399 ; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], v[14:29], s[4:7] a16
 400 ; GFX1030-NEXT:    ; implicit-def: $vgpr10_vgpr11
 401 ; GFX1030-NEXT:    ; implicit-def: $vgpr14
 402 ; GFX1030-NEXT:    ; implicit-def: $vgpr15
 403 ; GFX1030-NEXT:    ; implicit-def: $vgpr16
 404 ; GFX1030-NEXT:    ; implicit-def: $vgpr17
 405 ; GFX1030-NEXT:    ; implicit-def: $vgpr18
 406 ; GFX1030-NEXT:    ; implicit-def: $vgpr19
 407 ; GFX1030-NEXT:    ; implicit-def: $vgpr20
 408 ; GFX1030-NEXT:    ; implicit-def: $vgpr21
 409 ; GFX1030-NEXT:    ; implicit-def: $vgpr22
 410 ; GFX1030-NEXT:    ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13
 411 ; GFX1030-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 412 ; GFX1030-NEXT:    s_cbranch_execnz .LBB9_1
 413 ; GFX1030-NEXT:  ; %bb.2:
 414 ; GFX1030-NEXT:    s_mov_b32 exec_lo, s1
 415 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
 416 ; GFX1030-NEXT:    ; return to shader part epilog
 417 ;
 418 ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
 419 ; GFX1013:       ; %bb.0:
 420 ; GFX1013-NEXT:    s_mov_b32 s0, 0xffff
 421 ; GFX1013-NEXT:    v_mov_b32_e32 v16, v10
 422 ; GFX1013-NEXT:    v_mov_b32_e32 v17, v11
 423 ; GFX1013-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
 424 ; GFX1013-NEXT:    v_and_b32_e32 v11, s0, v8
 425 ; GFX1013-NEXT:    v_and_b32_e32 v9, s0, v9
 426 ; GFX1013-NEXT:    v_mov_b32_e32 v18, v12
 427 ; GFX1013-NEXT:    v_mov_b32_e32 v19, v13
 428 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 429 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
 430 ; GFX1013-NEXT:    v_alignbit_b32 v8, v9, v8, 16
 431 ; GFX1013-NEXT:    s_mov_b32 s1, exec_lo
 432 ; GFX1013-NEXT:    v_and_or_b32 v6, v6, s0, v10
 433 ; GFX1013-NEXT:    v_and_or_b32 v7, v7, s0, v11
 434 ; GFX1013-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 435 ; GFX1013-NEXT:    v_readfirstlane_b32 s4, v16
 436 ; GFX1013-NEXT:    v_readfirstlane_b32 s5, v17
 437 ; GFX1013-NEXT:    v_readfirstlane_b32 s6, v18
 438 ; GFX1013-NEXT:    v_readfirstlane_b32 s7, v19
 439 ; GFX1013-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[16:17]
 440 ; GFX1013-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[18:19]
 441 ; GFX1013-NEXT:    s_and_b32 s0, s0, vcc_lo
 442 ; GFX1013-NEXT:    s_and_saveexec_b32 s0, s0
 443 ; GFX1013-NEXT:    image_bvh64_intersect_ray v[20:23], v[0:15], s[4:7] a16
 444 ; GFX1013-NEXT:    ; implicit-def: $vgpr16_vgpr17
 445 ; GFX1013-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 446 ; GFX1013-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19
 447 ; GFX1013-NEXT:    s_waitcnt_depctr 0xffe3
 448 ; GFX1013-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 449 ; GFX1013-NEXT:    s_cbranch_execnz .LBB9_1
 450 ; GFX1013-NEXT:  ; %bb.2:
 451 ; GFX1013-NEXT:    s_mov_b32 exec_lo, s1
 452 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
 453 ; GFX1013-NEXT:    v_mov_b32_e32 v0, v20
 454 ; GFX1013-NEXT:    v_mov_b32_e32 v1, v21
 455 ; GFX1013-NEXT:    v_mov_b32_e32 v2, v22
 456 ; GFX1013-NEXT:    v_mov_b32_e32 v3, v23
 457 ; GFX1013-NEXT:    ; return to shader part epilog
 458   %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
     ; The intrinsic yields <4 x i32>; bitcast it to the function's
     ; <4 x float> return type.
 459   %r = bitcast <4 x i32> %v to <4 x float>
 460   ret <4 x float> %r
 461 }
 462
 463 define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr, float* %p_ray, <4 x i32> inreg %tdescr) {
     ; Compute-kernel variant: node_ptr and ray_extent are loaded from memory
     ; at an index given by the workitem id, while the origin / dir / inv_dir
     ; vectors are compile-time constants. The checks expect the two pointers
     ; and the descriptor to be fetched with s_load_dwordx4, the per-lane
     ; values to be read with flat_load_dword, the constants to be moved into
     ; v2-v10, and then a single image_bvh_intersect_ray with a v[0:15]
     ; address whose result is written out with flat_store_dwordx4. The two
     ; targets differ in which SGPRs hold the descriptor (s[0:3] vs s[8:11])
     ; and in the order/registers of the surrounding moves.
     ; NOTE(review): the name suggests this targets NSA register reassignment;
     ; that is not visible from the checks here -- confirm against the commit
     ; that added the test.
 464 ; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign:
 465 ; GFX1030:       ; %bb.0:
 466 ; GFX1030-NEXT:    s_clause 0x1
 467 ; GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 468 ; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
 469 ; GFX1030-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
 470 ; GFX1030-NEXT:    v_mov_b32_e32 v5, 0x40400000
 471 ; GFX1030-NEXT:    v_mov_b32_e32 v6, 4.0
 472 ; GFX1030-NEXT:    v_mov_b32_e32 v7, 0x40a00000
 473 ; GFX1030-NEXT:    v_mov_b32_e32 v8, 0x40c00000
 474 ; GFX1030-NEXT:    v_mov_b32_e32 v9, 0x40e00000
 475 ; GFX1030-NEXT:    v_mov_b32_e32 v10, 0x41000000
 476 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
 477 ; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
 478 ; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
 479 ; GFX1030-NEXT:    v_mov_b32_e32 v2, s6
 480 ; GFX1030-NEXT:    v_mov_b32_e32 v3, s7
 481 ; GFX1030-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
 482 ; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 483 ; GFX1030-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
 484 ; GFX1030-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
 485 ; GFX1030-NEXT:    v_mov_b32_e32 v4, 2.0
 486 ; GFX1030-NEXT:    flat_load_dword v0, v[0:1]
 487 ; GFX1030-NEXT:    flat_load_dword v1, v[2:3]
 488 ; GFX1030-NEXT:    v_mov_b32_e32 v2, 0
 489 ; GFX1030-NEXT:    v_mov_b32_e32 v3, 1.0
 490 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 491 ; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
 492 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
 493 ; GFX1030-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 494 ; GFX1030-NEXT:    s_endpgm
 495 ;
 496 ; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign:
 497 ; GFX1013:       ; %bb.0:
 498 ; GFX1013-NEXT:    s_clause 0x1
 499 ; GFX1013-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 500 ; GFX1013-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
 501 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
 502 ; GFX1013-NEXT:    v_mov_b32_e32 v7, 0x40a00000
 503 ; GFX1013-NEXT:    v_mov_b32_e32 v8, 0x40c00000
 504 ; GFX1013-NEXT:    v_mov_b32_e32 v9, 0x40e00000
 505 ; GFX1013-NEXT:    v_mov_b32_e32 v10, 0x41000000
 506 ; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
 507 ; GFX1013-NEXT:    v_mov_b32_e32 v0, s4
 508 ; GFX1013-NEXT:    v_mov_b32_e32 v1, s5
 509 ; GFX1013-NEXT:    v_mov_b32_e32 v2, s6
 510 ; GFX1013-NEXT:    v_mov_b32_e32 v3, s7
 511 ; GFX1013-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v6
 512 ; GFX1013-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
 513 ; GFX1013-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v6
 514 ; GFX1013-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
 515 ; GFX1013-NEXT:    v_mov_b32_e32 v6, 4.0
 516 ; GFX1013-NEXT:    flat_load_dword v0, v[4:5]
 517 ; GFX1013-NEXT:    flat_load_dword v1, v[2:3]
 518 ; GFX1013-NEXT:    v_mov_b32_e32 v2, 0
 519 ; GFX1013-NEXT:    v_mov_b32_e32 v3, 1.0
 520 ; GFX1013-NEXT:    v_mov_b32_e32 v4, 2.0
 521 ; GFX1013-NEXT:    v_mov_b32_e32 v5, 0x40400000
 522 ; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 523 ; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:15], s[8:11]
 524 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
 525 ; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 526 ; GFX1013-NEXT:    s_endpgm
     ; Per-lane inputs: index both pointer arguments by the workitem id.
 527   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
 528   %gep_node_ptr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %lid
 529   %node_ptr = load i32, i32* %gep_node_ptr, align 4
 530   %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
 531   %ray_extent = load float, float* %gep_ray, align 4
     ; Constant ray: origin = (0, 1, 2), dir = (3, 4, 5), inv_dir = (6, 7, 8).
     ; In the expected output 3.0 and 5.0-8.0 show up as the hex immediates
     ; 0x40400000, 0x40a00000, 0x40c00000, 0x40e00000 and 0x41000000.
 532   %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
 533   %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
 534   %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
 535   %ray_dir0 = insertelement <3 x float> undef, float 3.0, i32 0
 536   %ray_dir1 = insertelement <3 x float> %ray_dir0, float 4.0, i32 1
 537   %ray_dir = insertelement <3 x float> %ray_dir1, float 5.0, i32 2
 538   %ray_inv_dir0 = insertelement <3 x float> undef, float 6.0, i32 0
 539   %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float 7.0, i32 1
 540   %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float 8.0, i32 2
 541   %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
     ; The result is stored through an undef pointer; presumably only to keep
     ; the intrinsic result used in a void kernel -- the address is irrelevant.
 542   store <4 x i32> %v, <4 x i32>* undef
 543   ret void
 544 }
 545
 546 define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(i32* %p_node_ptr, float* %p_ray, <4 x i32> inreg %tdescr) {
     ; Kernel test for the .i32.v4f16 intersect-ray intrinsic: 32-bit node
     ; pointer, float ray_origin, half ray_dir / ray_inv_dir.
     ;   %p_node_ptr - array of node pointers, indexed by work-item id x
     ;   %p_ray      - array of ray extents, indexed by work-item id x
     ;   %tdescr     - texture descriptor, forwarded unchanged to the intrinsic
     ; The expected output for both llc run lines uses the a16 form of
     ; image_bvh_intersect_ray with one contiguous address range, v[0:7].
     ; NOTE(review): the "nsa_reassign" suffix presumably means this covers
     ; reassignment of non-sequential address registers - confirm.
     ; The assertions below are autogenerated (see the note on the first line of
     ; the file); regenerate them with the script instead of editing by hand.
 547 ; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
 548 ; GFX1030:       ; %bb.0:
 549 ; GFX1030-NEXT:    s_clause 0x1
 550 ; GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 551 ; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
 552 ; GFX1030-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
 553 ; GFX1030-NEXT:    s_movk_i32 s9, 0x4600
 554 ; GFX1030-NEXT:    s_movk_i32 s8, 0x4700
 555 ; GFX1030-NEXT:    s_bfe_u32 s8, s8, 0x100000
 556 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
 557 ; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
 558 ; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
 559 ; GFX1030-NEXT:    v_mov_b32_e32 v2, s6
 560 ; GFX1030-NEXT:    v_mov_b32_e32 v3, s7
 561 ; GFX1030-NEXT:    s_movk_i32 s5, 0x4400
 562 ; GFX1030-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
 563 ; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 564 ; GFX1030-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
 565 ; GFX1030-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
 566 ; GFX1030-NEXT:    s_movk_i32 s6, 0x4200
 567 ; GFX1030-NEXT:    flat_load_dword v0, v[0:1]
 568 ; GFX1030-NEXT:    flat_load_dword v1, v[2:3]
 569 ; GFX1030-NEXT:    s_bfe_u32 s5, s5, 0x100000
 570 ; GFX1030-NEXT:    s_movk_i32 s7, 0x4800
 571 ; GFX1030-NEXT:    s_bfe_u32 s6, s6, 0x100000
 572 ; GFX1030-NEXT:    s_lshl_b32 s5, s5, 16
 573 ; GFX1030-NEXT:    s_movk_i32 s4, 0x4500
 574 ; GFX1030-NEXT:    s_or_b32 s5, s6, s5
 575 ; GFX1030-NEXT:    s_bfe_u32 s6, s9, 0x100000
 576 ; GFX1030-NEXT:    s_bfe_u32 s7, s7, 0x100000
 577 ; GFX1030-NEXT:    s_bfe_u32 s4, s4, 0x100000
 578 ; GFX1030-NEXT:    s_lshl_b32 s6, s6, 16
 579 ; GFX1030-NEXT:    s_lshl_b32 s7, s7, 16
 580 ; GFX1030-NEXT:    s_or_b32 s4, s4, s6
 581 ; GFX1030-NEXT:    s_or_b32 s6, s8, s7
 582 ; GFX1030-NEXT:    v_mov_b32_e32 v2, 0
 583 ; GFX1030-NEXT:    v_mov_b32_e32 v3, 1.0
 584 ; GFX1030-NEXT:    v_mov_b32_e32 v4, 2.0
 585 ; GFX1030-NEXT:    v_mov_b32_e32 v5, s5
 586 ; GFX1030-NEXT:    v_mov_b32_e32 v6, s4
 587 ; GFX1030-NEXT:    v_mov_b32_e32 v7, s6
 588 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 589 ; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16
 590 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
 591 ; GFX1030-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 592 ; GFX1030-NEXT:    s_endpgm
 593 ;
 594 ; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
 595 ; GFX1013:       ; %bb.0:
 596 ; GFX1013-NEXT:    s_clause 0x1
 597 ; GFX1013-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 598 ; GFX1013-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
 599 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
 600 ; GFX1013-NEXT:    s_movk_i32 s1, 0x4400
 601 ; GFX1013-NEXT:    s_movk_i32 s2, 0x4200
 602 ; GFX1013-NEXT:    s_bfe_u32 s1, s1, 0x100000
 603 ; GFX1013-NEXT:    s_movk_i32 s3, 0x4800
 604 ; GFX1013-NEXT:    s_bfe_u32 s2, s2, 0x100000
 605 ; GFX1013-NEXT:    s_lshl_b32 s1, s1, 16
 606 ; GFX1013-NEXT:    s_movk_i32 s0, 0x4500
 607 ; GFX1013-NEXT:    s_or_b32 s1, s2, s1
 608 ; GFX1013-NEXT:    s_bfe_u32 s3, s3, 0x100000
 609 ; GFX1013-NEXT:    s_bfe_u32 s0, s0, 0x100000
 610 ; GFX1013-NEXT:    s_lshl_b32 s3, s3, 16
 611 ; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
 612 ; GFX1013-NEXT:    v_mov_b32_e32 v0, s4
 613 ; GFX1013-NEXT:    v_mov_b32_e32 v1, s5
 614 ; GFX1013-NEXT:    v_mov_b32_e32 v2, s6
 615 ; GFX1013-NEXT:    v_mov_b32_e32 v3, s7
 616 ; GFX1013-NEXT:    s_movk_i32 s5, 0x4600
 617 ; GFX1013-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v6
 618 ; GFX1013-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
 619 ; GFX1013-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v6
 620 ; GFX1013-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
 621 ; GFX1013-NEXT:    s_movk_i32 s4, 0x4700
 622 ; GFX1013-NEXT:    flat_load_dword v0, v[4:5]
 623 ; GFX1013-NEXT:    flat_load_dword v1, v[2:3]
 624 ; GFX1013-NEXT:    s_bfe_u32 s2, s5, 0x100000
 625 ; GFX1013-NEXT:    s_bfe_u32 s4, s4, 0x100000
 626 ; GFX1013-NEXT:    s_lshl_b32 s2, s2, 16
 627 ; GFX1013-NEXT:    v_mov_b32_e32 v2, 0
 628 ; GFX1013-NEXT:    s_or_b32 s0, s0, s2
 629 ; GFX1013-NEXT:    s_or_b32 s2, s4, s3
 630 ; GFX1013-NEXT:    v_mov_b32_e32 v3, 1.0
 631 ; GFX1013-NEXT:    v_mov_b32_e32 v4, 2.0
 632 ; GFX1013-NEXT:    v_mov_b32_e32 v5, s1
 633 ; GFX1013-NEXT:    v_mov_b32_e32 v6, s0
 634 ; GFX1013-NEXT:    v_mov_b32_e32 v7, s2
 635 ; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 636 ; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[8:11] a16
 637 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
 638 ; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 639 ; GFX1013-NEXT:    s_endpgm
       ; Load this work-item's node pointer and ray extent; both arrays are
       ; indexed by the work-item id in x.
 640   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
 641   %gep_node_ptr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %lid
 642   %node_ptr = load i32, i32* %gep_node_ptr, align 4
 643   %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
 644   %ray_extent = load float, float* %gep_ray, align 4
       ; ray_origin = <0.0, 1.0, 2.0> (float), built one element at a time.
 645   %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
 646   %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
 647   %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
       ; ray_dir = <3.0, 4.0, 5.0> and ray_inv_dir = <6.0, 7.0, 8.0> (half). In
       ; the expected output these show up as the immediates 0x4200..0x4800,
       ; combined in pairs into v5 (3.0|4.0), v6 (5.0|6.0) and v7 (7.0|8.0).
 648   %ray_dir0 = insertelement <3 x half> undef, half 3.0, i32 0
 649   %ray_dir1 = insertelement <3 x half> %ray_dir0, half 4.0, i32 1
 650   %ray_dir = insertelement <3 x half> %ray_dir1, half 5.0, i32 2
 651   %ray_inv_dir0 = insertelement <3 x half> undef, half 6.0, i32 0
 652   %ray_inv_dir1 = insertelement <3 x half> %ray_inv_dir0, half 7.0, i32 1
 653   %ray_inv_dir = insertelement <3 x half> %ray_inv_dir1, half 8.0, i32 2
 654   %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
       ; The destination address is undef; presumably the store exists only so
       ; the intrinsic result has a use - confirm.
 655   store <4 x i32> %v, <4 x i32>* undef
 656   ret void
 657 }
 658
 659 define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(float* %p_ray, <4 x i32> inreg %tdescr) {
     ; Kernel test for the .i64.v4f32 intersect-ray intrinsic: 64-bit node
     ; pointer (a constant here), all ray vectors in float.
     ;   %p_ray  - array of ray extents, indexed by work-item id x
     ;   %tdescr - texture descriptor, forwarded unchanged to the intrinsic
     ; The expected output for both llc run lines uses image_bvh64_intersect_ray
     ; with one contiguous address range, v[0:15].
     ; NOTE(review): the "nsa_reassign" suffix presumably means this covers
     ; reassignment of non-sequential address registers - confirm.
     ; The assertions below are autogenerated (see the note on the first line of
     ; the file); regenerate them with the script instead of editing by hand.
 660 ; GFX1030-LABEL: image_bvh64_intersect_ray_nsa_reassign:
 661 ; GFX1030:       ; %bb.0:
 662 ; GFX1030-NEXT:    s_clause 0x1
 663 ; GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 664 ; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
 665 ; GFX1030-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 666 ; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
 667 ; GFX1030-NEXT:    v_mov_b32_e32 v4, 1.0
 668 ; GFX1030-NEXT:    v_mov_b32_e32 v5, 2.0
 669 ; GFX1030-NEXT:    v_mov_b32_e32 v6, 0x40400000
 670 ; GFX1030-NEXT:    v_mov_b32_e32 v7, 4.0
 671 ; GFX1030-NEXT:    v_mov_b32_e32 v8, 0x40a00000
 672 ; GFX1030-NEXT:    v_mov_b32_e32 v9, 0x40c00000
 673 ; GFX1030-NEXT:    v_mov_b32_e32 v10, 0x40e00000
 674 ; GFX1030-NEXT:    v_mov_b32_e32 v11, 0x41000000
 675 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
 676 ; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
 677 ; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
 678 ; GFX1030-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 679 ; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 680 ; GFX1030-NEXT:    flat_load_dword v2, v[0:1]
 681 ; GFX1030-NEXT:    v_mov_b32_e32 v0, 0xb36211c7
 682 ; GFX1030-NEXT:    v_mov_b32_e32 v1, 0x102
 683 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 684 ; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
 685 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
 686 ; GFX1030-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 687 ; GFX1030-NEXT:    s_endpgm
 688 ;
 689 ; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign:
 690 ; GFX1013:       ; %bb.0:
 691 ; GFX1013-NEXT:    s_clause 0x1
 692 ; GFX1013-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 693 ; GFX1013-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 694 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 695 ; GFX1013-NEXT:    v_mov_b32_e32 v3, 0
 696 ; GFX1013-NEXT:    v_mov_b32_e32 v4, 1.0
 697 ; GFX1013-NEXT:    v_mov_b32_e32 v5, 2.0
 698 ; GFX1013-NEXT:    v_mov_b32_e32 v6, 0x40400000
 699 ; GFX1013-NEXT:    v_mov_b32_e32 v7, 4.0
 700 ; GFX1013-NEXT:    v_mov_b32_e32 v8, 0x40a00000
 701 ; GFX1013-NEXT:    v_mov_b32_e32 v9, 0x40c00000
 702 ; GFX1013-NEXT:    v_mov_b32_e32 v10, 0x40e00000
 703 ; GFX1013-NEXT:    v_mov_b32_e32 v11, 0x41000000
 704 ; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
 705 ; GFX1013-NEXT:    v_mov_b32_e32 v0, s2
 706 ; GFX1013-NEXT:    v_mov_b32_e32 v1, s3
 707 ; GFX1013-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 708 ; GFX1013-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 709 ; GFX1013-NEXT:    flat_load_dword v2, v[0:1]
 710 ; GFX1013-NEXT:    v_mov_b32_e32 v0, 0xb36211c7
 711 ; GFX1013-NEXT:    v_mov_b32_e32 v1, 0x102
 712 ; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 713 ; GFX1013-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[4:7]
 714 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
 715 ; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 716 ; GFX1013-NEXT:    s_endpgm
       ; Load this work-item's ray extent, indexed by the work-item id in x.
 717   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
 718   %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
 719   %ray_extent = load float, float* %gep_ray, align 4
       ; ray_origin = <0.0, 1.0, 2.0>, ray_dir = <3.0, 4.0, 5.0> and
       ; ray_inv_dir = <6.0, 7.0, 8.0>, each built one element at a time. They
       ; land in v3..v11 in the expected output.
 720   %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
 721   %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
 722   %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
 723   %ray_dir0 = insertelement <3 x float> undef, float 3.0, i32 0
 724   %ray_dir1 = insertelement <3 x float> %ray_dir0, float 4.0, i32 1
 725   %ray_dir = insertelement <3 x float> %ray_dir1, float 5.0, i32 2
 726   %ray_inv_dir0 = insertelement <3 x float> undef, float 6.0, i32 0
 727   %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float 7.0, i32 1
 728   %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float 8.0, i32 2
       ; The node pointer is the constant 1111111111111 (0x102b36211c7); the
       ; expected output materializes it as v1 = 0x102, v0 = 0xb36211c7.
 729   %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 1111111111111, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
       ; The destination address is undef; presumably the store exists only so
       ; the intrinsic result has a use - confirm.
 730   store <4 x i32> %v, <4 x i32>* undef
 731   ret void
 732 }
 733
 734 define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(float* %p_ray, <4 x i32> inreg %tdescr) {
     ; Kernel test for the .i64.v4f16 intersect-ray intrinsic: 64-bit node
     ; pointer (a constant here), float ray_origin, half ray_dir / ray_inv_dir.
     ;   %p_ray  - array of ray extents, indexed by work-item id x
     ;   %tdescr - texture descriptor, forwarded unchanged to the intrinsic
     ; The expected output for both llc run lines uses the a16 form of
     ; image_bvh64_intersect_ray with one contiguous address range, v[0:15].
     ; NOTE(review): the "nsa_reassign" suffix presumably means this covers
     ; reassignment of non-sequential address registers - confirm.
     ; The assertions below are autogenerated (see the note on the first line of
     ; the file); regenerate them with the script instead of editing by hand.
 735 ; GFX1030-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
 736 ; GFX1030:       ; %bb.0:
 737 ; GFX1030-NEXT:    s_clause 0x1
 738 ; GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 739 ; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
 740 ; GFX1030-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 741 ; GFX1030-NEXT:    s_movk_i32 s6, 0x4200
 742 ; GFX1030-NEXT:    s_movk_i32 s7, 0x4800
 743 ; GFX1030-NEXT:    s_bfe_u32 s6, s6, 0x100000
 744 ; GFX1030-NEXT:    s_movk_i32 s9, 0x4600
 745 ; GFX1030-NEXT:    s_movk_i32 s8, 0x4700
 746 ; GFX1030-NEXT:    s_bfe_u32 s7, s7, 0x100000
 747 ; GFX1030-NEXT:    s_bfe_u32 s8, s8, 0x100000
 748 ; GFX1030-NEXT:    s_lshl_b32 s7, s7, 16
 749 ; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
 750 ; GFX1030-NEXT:    v_mov_b32_e32 v4, 1.0
 751 ; GFX1030-NEXT:    v_mov_b32_e32 v5, 2.0
 752 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
 753 ; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
 754 ; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
 755 ; GFX1030-NEXT:    s_movk_i32 s5, 0x4400
 756 ; GFX1030-NEXT:    s_movk_i32 s4, 0x4500
 757 ; GFX1030-NEXT:    s_bfe_u32 s5, s5, 0x100000
 758 ; GFX1030-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 759 ; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 760 ; GFX1030-NEXT:    s_lshl_b32 s5, s5, 16
 761 ; GFX1030-NEXT:    s_bfe_u32 s4, s4, 0x100000
 762 ; GFX1030-NEXT:    s_or_b32 s5, s6, s5
 763 ; GFX1030-NEXT:    flat_load_dword v2, v[0:1]
 764 ; GFX1030-NEXT:    s_bfe_u32 s6, s9, 0x100000
 765 ; GFX1030-NEXT:    v_mov_b32_e32 v0, 0xb36211c6
 766 ; GFX1030-NEXT:    s_lshl_b32 s6, s6, 16
 767 ; GFX1030-NEXT:    v_mov_b32_e32 v1, 0x102
 768 ; GFX1030-NEXT:    s_or_b32 s4, s4, s6
 769 ; GFX1030-NEXT:    s_or_b32 s6, s8, s7
 770 ; GFX1030-NEXT:    v_mov_b32_e32 v6, s5
 771 ; GFX1030-NEXT:    v_mov_b32_e32 v7, s4
 772 ; GFX1030-NEXT:    v_mov_b32_e32 v8, s6
 773 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 774 ; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
 775 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
 776 ; GFX1030-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 777 ; GFX1030-NEXT:    s_endpgm
 778 ;
 779 ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
 780 ; GFX1013:       ; %bb.0:
 781 ; GFX1013-NEXT:    s_clause 0x1
 782 ; GFX1013-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 783 ; GFX1013-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 784 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 785 ; GFX1013-NEXT:    s_movk_i32 s1, 0x4400
 786 ; GFX1013-NEXT:    s_movk_i32 s9, 0x4600
 787 ; GFX1013-NEXT:    s_bfe_u32 s1, s1, 0x100000
 788 ; GFX1013-NEXT:    s_movk_i32 s0, 0x4500
 789 ; GFX1013-NEXT:    s_lshl_b32 s1, s1, 16
 790 ; GFX1013-NEXT:    s_movk_i32 s8, 0x4700
 791 ; GFX1013-NEXT:    s_bfe_u32 s0, s0, 0x100000
 792 ; GFX1013-NEXT:    s_bfe_u32 s8, s8, 0x100000
 793 ; GFX1013-NEXT:    v_mov_b32_e32 v3, 0
 794 ; GFX1013-NEXT:    v_mov_b32_e32 v4, 1.0
 795 ; GFX1013-NEXT:    v_mov_b32_e32 v5, 2.0
 796 ; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
 797 ; GFX1013-NEXT:    v_mov_b32_e32 v0, s2
 798 ; GFX1013-NEXT:    v_mov_b32_e32 v1, s3
 799 ; GFX1013-NEXT:    s_movk_i32 s2, 0x4200
 800 ; GFX1013-NEXT:    s_movk_i32 s3, 0x4800
 801 ; GFX1013-NEXT:    s_bfe_u32 s2, s2, 0x100000
 802 ; GFX1013-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 803 ; GFX1013-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 804 ; GFX1013-NEXT:    s_or_b32 s1, s2, s1
 805 ; GFX1013-NEXT:    s_bfe_u32 s2, s9, 0x100000
 806 ; GFX1013-NEXT:    s_bfe_u32 s3, s3, 0x100000
 807 ; GFX1013-NEXT:    flat_load_dword v2, v[0:1]
 808 ; GFX1013-NEXT:    s_lshl_b32 s2, s2, 16
 809 ; GFX1013-NEXT:    s_lshl_b32 s3, s3, 16
 810 ; GFX1013-NEXT:    s_or_b32 s0, s0, s2
 811 ; GFX1013-NEXT:    s_or_b32 s2, s8, s3
 812 ; GFX1013-NEXT:    v_mov_b32_e32 v0, 0xb36211c6
 813 ; GFX1013-NEXT:    v_mov_b32_e32 v1, 0x102
 814 ; GFX1013-NEXT:    v_mov_b32_e32 v6, s1
 815 ; GFX1013-NEXT:    v_mov_b32_e32 v7, s0
 816 ; GFX1013-NEXT:    v_mov_b32_e32 v8, s2
 817 ; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 818 ; GFX1013-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[4:7] a16
 819 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
 820 ; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 821 ; GFX1013-NEXT:    s_endpgm
       ; Load this work-item's ray extent, indexed by the work-item id in x.
 822   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
 823   %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
 824   %ray_extent = load float, float* %gep_ray, align 4
       ; ray_origin = <0.0, 1.0, 2.0> (float), built one element at a time.
 825   %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
 826   %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
 827   %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
       ; ray_dir = <3.0, 4.0, 5.0> and ray_inv_dir = <6.0, 7.0, 8.0> (half). In
       ; the expected output these show up as the immediates 0x4200..0x4800,
       ; combined in pairs into v6 (3.0|4.0), v7 (5.0|6.0) and v8 (7.0|8.0).
 828   %ray_dir0 = insertelement <3 x half> undef, half 3.0, i32 0
 829   %ray_dir1 = insertelement <3 x half> %ray_dir0, half 4.0, i32 1
 830   %ray_dir = insertelement <3 x half> %ray_dir1, half 5.0, i32 2
 831   %ray_inv_dir0 = insertelement <3 x half> undef, half 6.0, i32 0
 832   %ray_inv_dir1 = insertelement <3 x half> %ray_inv_dir0, half 7.0, i32 1
 833   %ray_inv_dir = insertelement <3 x half> %ray_inv_dir1, half 8.0, i32 2
       ; The node pointer is the constant 1111111111110 (0x102b36211c6); the
       ; expected output materializes it as v1 = 0x102, v0 = 0xb36211c6.
 834   %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 1111111111110, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
       ; The destination address is undef; presumably the store exists only so
       ; the intrinsic result has a use - confirm.
 835   store <4 x i32> %v, <4 x i32>* undef
 836   ret void
 837 }