test/CodeGen/AMDGPU/smrd.ll

   1 ; RUN: llc -march=amdgcn -mcpu=tahiti  -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=SI   -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SICI -check-prefix=SIVIGFX9_10 %s
   2 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=CI   -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SICI %s
   3 ; RUN: llc -march=amdgcn -mcpu=tonga   -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI   -check-prefix=GCN -check-prefix=SICIVI -check-prefix=VIGFX9_10 -check-prefix=SIVIGFX9_10 %s
   4 ; RUN: llc -march=amdgcn -mcpu=gfx900  -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=VIGFX9_10 -check-prefix=SIVIGFX9_10  %s
   5 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GFX10 -check-prefix=GCN -check-prefix=VIGFX9_10 -check-prefix=SIVIGFX9_10  %s
   6
   7 ; SMRD load with an immediate offset.
     ; The IR loads the i32 at index 1 of %ptr; the checks expect immediate 0x1 on
     ; SI/CI and 0x4 on VI/GFX9/GFX10.
   8 ; GCN-LABEL: {{^}}smrd0:
   9 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
  10 ; VIGFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
  11 define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 {
  12 entry:
  13   %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 1
  14   %tmp1 = load i32, i32 addrspace(4)* %tmp
  15   store i32 %tmp1, i32 addrspace(1)* %out
  16   ret void
  17 }
  18
  19 ; SMRD load with the largest possible immediate offset.
     ; The IR loads the i32 at index 255 of %ptr; the checks expect immediate 0xff on
     ; SI/CI and 0x3fc on VI/GFX9/GFX10.
  20 ; GCN-LABEL: {{^}}smrd1:
  21 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}}
  22 ; VIGFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
  23 define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 {
  24 entry:
  25   %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 255
  26   %tmp1 = load i32, i32 addrspace(4)* %tmp
  27   store i32 %tmp1, i32 addrspace(1)* %out
  28   ret void
  29 }
  30
  31 ; SMRD load with an offset greater than the largest possible immediate.
     ; The GEP index is 256. SI is expected to put 0x400 in an SGPR and use that as the
     ; offset, CI to use immediate 0x100, and VI/GFX9/GFX10 to use immediate 0x400.
  32 ; GCN-LABEL: {{^}}smrd2:
  33 ; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400
  34 ; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
  35 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
  36 ; VIGFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
  37 ; GCN: s_endpgm
  38 define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 {
  39 entry:
  40   %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 256
  41   %tmp1 = load i32, i32 addrspace(4)* %tmp
  42   store i32 %tmp1, i32 addrspace(1)* %out
  43   ret void
  44 }
  45
  46 ; SMRD load with a 64-bit offset
     ; The GEP index is 4294967296, which does not fit in 32 bits. Apart from the
     ; common s_endpgm check, only the SI output is verified here.
  47 ; GCN-LABEL: {{^}}smrd3:
  48 ; FIXME: There are too many copies here because we don't fold immediates
  49 ;        through REG_SEQUENCE
  50 ; SI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0x13 ; encoding: [0x13
  51 ; TODO: Add VI checks
  52 ; GCN: s_endpgm
  53 define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, [8 x i32], i32 addrspace(4)* %ptr) #0 {
  54 entry:
  55   %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 4294967296
  56   %tmp1 = load i32, i32 addrspace(4)* %tmp
  57   store i32 %tmp1, i32 addrspace(1)* %out
  58   ret void
  59 }
  60
  61 ; SMRD load with the largest possible immediate offset on VI
     ; The GEP index is 262143. VI/GFX9/GFX10 are expected to use immediate 0xffffc and
     ; CI immediate 0x3ffff, while SI first moves 0xffffc into an SGPR.
  62 ; GCN-LABEL: {{^}}smrd4:
  63 ; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc
  64 ; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
  65 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
  66 ; VIGFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
  67 define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 {
  68 entry:
  69   %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 262143
  70   %tmp1 = load i32, i32 addrspace(4)* %tmp
  71   store i32 %tmp1, i32 addrspace(1)* %out
  72   ret void
  73 }
  74
  75 ; SMRD load with an offset greater than the largest possible immediate on VI
     ; The GEP index is 262144. SI and VI/GFX9/GFX10 are expected to move 0x100000 into
     ; an SGPR and use it as the offset; CI is expected to use immediate 0x40000.
  76 ; GCN-LABEL: {{^}}smrd5:
  77 ; SIVIGFX9_10: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000
  78 ; SIVIGFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
  79 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
  80 ; GCN: s_endpgm
  81 define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 {
  82 entry:
  83   %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 262144
  84   %tmp1 = load i32, i32 addrspace(4)* %tmp
  85   store i32 %tmp1, i32 addrspace(1)* %out
  86   ret void
  87 }
  88
  89 ; GCN-LABEL: {{^}}smrd_hazard:
     ; The descriptor is built from the constants 0..3 and used right away by an
     ; s.buffer.load, so the four s_mov_b32 writes are expected immediately before the
     ; load. On SI a nop is expected between the writes and the load.
  90 ; GCN-DAG: s_mov_b32 s3, 3
  91 ; GCN-DAG: s_mov_b32 s2, 2
  92 ; GCN-DAG: s_mov_b32 s1, 1
  93 ; GCN-DAG: s_mov_b32 s0, 0
  94 ; SI-NEXT: nop 3
  95 ; GFX10-NEXT: ; implicit-def: $vcc_hi
  96 ; GCN-NEXT: s_buffer_load_dword s0, s[0:3], 0x0
  97 define amdgpu_ps float @smrd_hazard(<4 x i32> inreg %desc) #0 {
  98 main_body:
  99   %d0 = insertelement <4 x i32> undef, i32 0, i32 0
 100   %d1 = insertelement <4 x i32> %d0, i32 1, i32 1
 101   %d2 = insertelement <4 x i32> %d1, i32 2, i32 2
 102   %d3 = insertelement <4 x i32> %d2, i32 3, i32 3
 103   %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %d3, i32 0, i32 0)
 104   ret float %r
 105 }
 106
 107 ; SMRD load using the s.buffer.load intrinsic with an immediate offset
     ; The intrinsic offset is 16; the checks expect immediate 0x4 on SI/CI and 0x10 on
     ; VI/GFX9/GFX10.
 108 ; GCN-LABEL: {{^}}smrd_load_const0:
 109 ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04
 110 ; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10
 111 define amdgpu_ps void @smrd_load_const0(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
 112 main_body:
 113   %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
 114   %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
 115   %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 16, i32 0)
 116   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0
 117   ret void
 118 }
 119
 120 ; SMRD load using the s.buffer.load intrinsic with the largest possible immediate
 121 ; offset (1020). The second load passes 1 as its last operand and is checked for glc.
 122 ; GCN-LABEL: {{^}}smrd_load_const1:
 123 ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
 124 ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff glc ; encoding: [0xff
 125 ; VIGFX9_10-DAG: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc ;
 126 ; VIGFX9_10-DAG: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc glc ;
 127 define amdgpu_ps void @smrd_load_const1(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
 128 main_body:
 129   %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
 130   %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
 131   %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 1020, i32 0)
 132   %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
 133   %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1020, i32 1)
 134   %s.buffer.float = bitcast i32 %s.buffer to float
 135   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
 136   ret void
 137 }
 138
 139 ; SMRD load using the s.buffer.load intrinsic with an offset (1024) greater than
 140 ; the largest possible SI immediate. SI is expected to put 0x400 in an SGPR, while
 141 ; CI (0x100) and VI/GFX9/GFX10 (0x400) are expected to still use an immediate.
 142 ; GCN-LABEL: {{^}}smrd_load_const2:
 143 ; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400
 144 ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
 145 ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
 146 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
 147 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
 148 ; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
 149 ; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
 150 define amdgpu_ps void @smrd_load_const2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
 151 main_body:
 152   %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
 153   %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
 154   %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 1024, i32 0)
 155   %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
 156   %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1024, i32 0)
 157   %s.buffer.float = bitcast i32 %s.buffer to float
 158   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
 159   ret void
 160 }
 161
 162 ; SMRD load with the largest possible immediate offset on VI
     ; Both s.buffer.loads use offset 1048572. VI/GFX9/GFX10 are expected to use
     ; immediate 0xffffc and CI immediate 0x3ffff, while SI first moves 0xffffc into an
     ; SGPR that both loads share.
 163 ; GCN-LABEL: {{^}}smrd_load_const3:
 164 ; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc
 165 ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 166 ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 167 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
 168 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
 169 ; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
 170 ; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
 171 define amdgpu_ps void @smrd_load_const3(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
 172 main_body:
 173   %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
 174   %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
 175   %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 1048572, i32 0)
 176   %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
 177   %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1048572, i32 0)
 178   %s.buffer.float = bitcast i32 %s.buffer to float
 179   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
 180   ret void
 181 }
 182
 183 ; SMRD load with an offset greater than the largest possible immediate on VI
     ; Both s.buffer.loads use offset 1048576. SI and VI/GFX9/GFX10 are expected to
     ; move 0x100000 into an SGPR shared by both loads; CI is expected to use immediate
     ; 0x40000.
 184 ; GCN-LABEL: {{^}}smrd_load_const4:
 185 ; SIVIGFX9_10: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000
 186 ; SIVIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 187 ; SIVIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 188 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
 189 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
 190 ; GCN: s_endpgm
 191 define amdgpu_ps void @smrd_load_const4(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
 192 main_body:
 193   %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
 194   %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
 195   %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 1048576, i32 0)
 196   %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
 197   %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1048576, i32 0)
 198   %s.buffer.float = bitcast i32 %s.buffer to float
 199   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
 200   ret void
 201 }
 202
 203 ; dwordx2 s.buffer.load at offset 128, expected as immediate 0x80 on VI/GFX9/GFX10
     ; and 0x20 on SI/CI. Both loaded elements are exported so the full result is used.
 204 ; GCN-LABEL: {{^}}s_buffer_load_dwordx2:
 205 ; VIGFX9_10: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x80
 206 ; SICI: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20
 207 define amdgpu_ps void @s_buffer_load_dwordx2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
 208 main_body:
 209   %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
 210   %s.buffer = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %tmp22, i32 128, i32 0)
 211   %s.buffer.0 = extractelement <2 x i32> %s.buffer, i32 0
 212   %s.buffer.0.float = bitcast i32 %s.buffer.0 to float
 213   %s.buffer.1 = extractelement <2 x i32> %s.buffer, i32 1
 214   %s.buffer.1.float = bitcast i32 %s.buffer.1 to float
 215   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %s.buffer.0.float, float %s.buffer.1.float, float %s.buffer.0.float, float %s.buffer.1.float, i1 true, i1 true) #0
 216   ret void
 217 }
 218
 219 ; dwordx4 s.buffer.load at offset 128, expected as immediate 0x80 on VI/GFX9/GFX10
     ; and 0x20 on SI/CI. All four loaded elements are exported.
 220 ; GCN-LABEL: {{^}}s_buffer_load_dwordx4:
 221 ; VIGFX9_10: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x80
 222 ; SICI: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20
 223 define amdgpu_ps void @s_buffer_load_dwordx4(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
 224 main_body:
 225   %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
 226   %s.buffer = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %tmp22, i32 128, i32 0)
 227   %s.buffer.0 = extractelement <4 x i32> %s.buffer, i32 0
 228   %s.buffer.0.float = bitcast i32 %s.buffer.0 to float
 229   %s.buffer.1 = extractelement <4 x i32> %s.buffer, i32 1
 230   %s.buffer.1.float = bitcast i32 %s.buffer.1 to float
 231   %s.buffer.2 = extractelement <4 x i32> %s.buffer, i32 2
 232   %s.buffer.2.float = bitcast i32 %s.buffer.2 to float
 233   %s.buffer.3 = extractelement <4 x i32> %s.buffer, i32 3
 234   %s.buffer.3.float = bitcast i32 %s.buffer.3 to float
 235   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %s.buffer.0.float, float %s.buffer.1.float, float %s.buffer.2.float, float %s.buffer.3.float, i1 true, i1 true) #0
 236   ret void
 237 }
 238
 239 ; dwordx8 s.buffer.load at offset 128, expected as immediate 0x80 on VI/GFX9/GFX10
     ; and 0x20 on SI/CI. Only elements 0, 2, 5 and 7 of the result are exported.
 240 ; GCN-LABEL: {{^}}s_buffer_load_dwordx8:
 241 ; VIGFX9_10: s_buffer_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x80
 242 ; SICI: s_buffer_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20
 243 define amdgpu_ps void @s_buffer_load_dwordx8(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
 244 main_body:
 245   %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
 246   %s.buffer = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %tmp22, i32 128, i32 0)
 247   %s.buffer.0 = extractelement <8 x i32> %s.buffer, i32 0
 248   %s.buffer.0.float = bitcast i32 %s.buffer.0 to float
 249   %s.buffer.1 = extractelement <8 x i32> %s.buffer, i32 2
 250   %s.buffer.1.float = bitcast i32 %s.buffer.1 to float
 251   %s.buffer.2 = extractelement <8 x i32> %s.buffer, i32 5
 252   %s.buffer.2.float = bitcast i32 %s.buffer.2 to float
 253   %s.buffer.3 = extractelement <8 x i32> %s.buffer, i32 7
 254   %s.buffer.3.float = bitcast i32 %s.buffer.3 to float
 255   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %s.buffer.0.float, float %s.buffer.1.float, float %s.buffer.2.float, float %s.buffer.3.float, i1 true, i1 true) #0
 256   ret void
 257 }
 258
 259 ; dwordx16 s.buffer.load at offset 128, expected as immediate 0x80 on VI/GFX9/GFX10
     ; and 0x20 on SI/CI. Only elements 0, 3, 12 and 15 of the result are exported.
 260 ; GCN-LABEL: {{^}}s_buffer_load_dwordx16:
 261 ; VIGFX9_10: s_buffer_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x80
 262 ; SICI: s_buffer_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20
 263 define amdgpu_ps void @s_buffer_load_dwordx16(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
 264 main_body:
 265   %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
 266   %s.buffer = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %tmp22, i32 128, i32 0)
 267   %s.buffer.0 = extractelement <16 x i32> %s.buffer, i32 0
 268   %s.buffer.0.float = bitcast i32 %s.buffer.0 to float
 269   %s.buffer.1 = extractelement <16 x i32> %s.buffer, i32 3
 270   %s.buffer.1.float = bitcast i32 %s.buffer.1 to float
 271   %s.buffer.2 = extractelement <16 x i32> %s.buffer, i32 12
 272   %s.buffer.2.float = bitcast i32 %s.buffer.2 to float
 273   %s.buffer.3 = extractelement <16 x i32> %s.buffer, i32 15
 274   %s.buffer.3.float = bitcast i32 %s.buffer.3 to float
 275   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %s.buffer.0.float, float %s.buffer.1.float, float %s.buffer.2.float, float %s.buffer.3.float, i1 true, i1 true) #0
 276   ret void
 277 }
 278
 279 ; GCN-LABEL: {{^}}smrd_sgpr_offset:
     ; s.buffer.load whose offset is an inreg argument; the check expects the load to
     ; use s4 directly as its offset operand.
 280 ; GCN: s_buffer_load_dword s{{[0-9]}}, s[0:3], s4
 281 define amdgpu_ps float @smrd_sgpr_offset(<4 x i32> inreg %desc, i32 inreg %offset) #0 {
 282 main_body:
 283   %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 0)
 284   ret float %r
 285 }
 286
 287 ; GCN-LABEL: {{^}}smrd_vgpr_offset:
     ; s.buffer.load whose offset argument is not inreg; the check expects a
     ; buffer_load_dword with v0 as the offen offset instead of a scalar load.
 288 ; GCN: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen ;
 289 define amdgpu_ps float @smrd_vgpr_offset(<4 x i32> inreg %desc, i32 %offset) #0 {
 290 main_body:
 291   %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 0)
 292   ret float %r
 293 }
 294
 295 ; GCN-LABEL: {{^}}smrd_vgpr_offset_imm:
     ; The non-inreg offset has 4092 added to it in the IR; the checks expect the add
     ; to be folded into the offset:4092 field of a single buffer_load_dword.
 296 ; GCN-NEXT: %bb.
 297 ; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen offset:4092 ;
 298 define amdgpu_ps float @smrd_vgpr_offset_imm(<4 x i32> inreg %desc, i32 %offset) #0 {
 299 main_body:
 300   %off = add i32 %offset, 4092
 301   %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %off, i32 0)
 302   ret float %r
 303 }
 304
 305 ; GCN-LABEL: {{^}}smrd_vgpr_offset_imm_too_large:
     ; Same as smrd_vgpr_offset_imm but with 4096 added. SI/CI are expected to keep a
     ; separate v_add of 0x1000, while VI/GFX9/GFX10 are expected to split the constant
     ; into soffset 4 plus offset:4092.
 306 ; GCN-NEXT: %bb.
 307 ; SICI-NEXT: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}0x1000, v0
 308 ; SICI-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen ;
 309 ; VIGFX9_10-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 4 offen offset:4092 ;
 310 define amdgpu_ps float @smrd_vgpr_offset_imm_too_large(<4 x i32> inreg %desc, i32 %offset) #0 {
 311 main_body:
 312   %off = add i32 %offset, 4096
 313   %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %off, i32 0)
 314   ret float %r
 315 }
 316
 317 ; GCN-LABEL: {{^}}smrd_imm_merged:
     ; Six single-dword s.buffer.loads at offsets 4, 8, 12, 16, 28 and 32 from the same
     ; descriptor. The checks expect them to be merged into one dwordx4 load (starting
     ; at offset 4) and one dwordx2 load (starting at offset 28).
 318 ; GCN-NEXT: %bb.
 319 ; SICI-NEXT: s_buffer_load_dwordx4 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x1
 320 ; SICI-NEXT: s_buffer_load_dwordx2 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x7
 321 ; VIGFX9_10-NEXT: s_buffer_load_dwordx4 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x4
 322 ; VIGFX9_10-NEXT: s_buffer_load_dwordx2 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x1c
 323 define amdgpu_ps void @smrd_imm_merged(<4 x i32> inreg %desc) #0 {
 324 main_body:
 325   %r1 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 4, i32 0)
 326   %r2 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 8, i32 0)
 327   %r3 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 12, i32 0)
 328   %r4 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 16, i32 0)
 329   %r5 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 28, i32 0)
 330   %r6 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 32, i32 0)
 331   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) #0
 332   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) #0
 333   ret void
 334 }
 335
 336 ; GCN-LABEL: {{^}}smrd_imm_merge_m0:
 337 ;
     ; Two single-dword s.buffer.loads at offsets 0 and 4 are separated in the IR by
     ; interpolation intrinsics and dynamically indexed extractelements. The checks
     ; expect the two loads to be merged into one dwordx2 load, a single write to m0,
     ; and the interp instructions after it in any order.
     ;
 338 ; GCN: s_buffer_load_dwordx2
 339 ; SICIVI: s_mov_b32 m0
 340 ; SICIVI-DAG: v_interp_p1_f32
 341 ; SICIVI-DAG: v_interp_p1_f32
 342 ; SICIVI-DAG: v_interp_p1_f32
 343 ; SICIVI-DAG: v_interp_p2_f32
 344 ; SICIVI-DAG: v_interp_p2_f32
 345 ; SICIVI-DAG: v_interp_p2_f32
 346 ;
 347 ; extractelement does not result in movrels anymore for vectors fitting 8 dwords
 348 ; SICIVI-NOT: s_mov_b32 m0
 349 ; SICIVI-NOT: v_movrels_b32_e32
 350 ; v_cndmask_b32_e32
 351 ; v_cndmask_b32_e32
 352 ;
 353 ; Merging is still thwarted on GFX9 due to s_set_gpr_idx
 354 ;
 355 define amdgpu_ps float @smrd_imm_merge_m0(<4 x i32> inreg %desc, i32 inreg %prim, float %u, float %v) #0 {
 356 main_body:
 357   %idx1.f = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 0, i32 0)
 358   %idx1 = bitcast float %idx1.f to i32
 359
 360   %v0.x1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 0, i32 0, i32 %prim)
 361   %v0.x = call nsz float @llvm.amdgcn.interp.p2(float %v0.x1, float %v, i32 0, i32 0, i32 %prim)
 362   %v0.y1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 0, i32 1, i32 %prim)
 363   %v0.y = call nsz float @llvm.amdgcn.interp.p2(float %v0.y1, float %v, i32 0, i32 1, i32 %prim)
 364   %v0.z1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 0, i32 2, i32 %prim)
 365   %v0.z = call nsz float @llvm.amdgcn.interp.p2(float %v0.z1, float %v, i32 0, i32 2, i32 %prim)
 366   %v0.tmp0 = insertelement <3 x float> undef, float %v0.x, i32 0
 367   %v0.tmp1 = insertelement <3 x float> %v0.tmp0, float %v0.y, i32 1
 368   %v0 = insertelement <3 x float> %v0.tmp1, float %v0.z, i32 2
 369   %a = extractelement <3 x float> %v0, i32 %idx1
 370
 371   %v1.x1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 1, i32 0, i32 %prim)
 372   %v1.x = call nsz float @llvm.amdgcn.interp.p2(float %v1.x1, float %v, i32 1, i32 0, i32 %prim)
 373   %v1.y1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 1, i32 1, i32 %prim)
 374   %v1.y = call nsz float @llvm.amdgcn.interp.p2(float %v1.y1, float %v, i32 1, i32 1, i32 %prim)
 375   %v1.z1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 1, i32 2, i32 %prim)
 376   %v1.z = call nsz float @llvm.amdgcn.interp.p2(float %v1.z1, float %v, i32 1, i32 2, i32 %prim)
     ; NOTE(review): %v1 below is assembled from the %v0.* values, so %v1.x/%v1.y/%v1.z
     ; are unused. This looks like a copy-paste slip, but the checks were written against
     ; this IR, so it is left as is -- confirm before changing.
 377   %v1.tmp0 = insertelement <3 x float> undef, float %v0.x, i32 0
 378   %v1.tmp1 = insertelement <3 x float> %v0.tmp0, float %v0.y, i32 1
 379   %v1 = insertelement <3 x float> %v0.tmp1, float %v0.z, i32 2
 380
 381   %b = extractelement <3 x float> %v1, i32 %idx1
 382   %c = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 4, i32 0)
 383
 384   %res.tmp = fadd float %a, %b
 385   %res = fadd float %res.tmp, %c
 386   ret float %res
 387 }
 388
 389 ; GCN-LABEL: {{^}}smrd_vgpr_merged:
     ; Six single-dword s.buffer.loads whose offsets are the non-inreg %a plus 4, 8, 12,
     ; 16, 28 and 32. The checks expect them to be merged into one buffer_load_dwordx4
     ; (offset:4) and one buffer_load_dwordx2 (offset:28), both using v0 as the base.
 390 ; GCN-NEXT: %bb.
 391 ; GCN-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
 392 ; GCN-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28
 393 define amdgpu_ps void @smrd_vgpr_merged(<4 x i32> inreg %desc, i32 %a) #0 {
 394 main_body:
 395   %a1 = add i32 %a, 4
 396   %a2 = add i32 %a, 8
 397   %a3 = add i32 %a, 12
 398   %a4 = add i32 %a, 16
 399   %a5 = add i32 %a, 28
 400   %a6 = add i32 %a, 32
 401   %r1 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %a1, i32 0)
 402   %r2 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %a2, i32 0)
 403   %r3 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %a3, i32 0)
 404   %r4 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %a4, i32 0)
 405   %r5 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %a5, i32 0)
 406   %r6 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %a6, i32 0)
 407   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) #0
 408   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) #0
 409   ret void
 410 }
 411
 412 ; GCN-LABEL: {{^}}smrd_sgpr_descriptor_promoted:
     ; The descriptor is loaded inside a nested loop through a pointer derived from an
     ; inreg argument and is used by two s.buffer.loads: one at constant offset 0 and
     ; one at an offset computed from the loop counter. The only requirement checked is
     ; that a v_readfirstlane appears in the output.
 413 ; GCN: v_readfirstlane
 414 define amdgpu_cs void @smrd_sgpr_descriptor_promoted([0 x i8] addrspace(4)* inreg noalias dereferenceable(18446744073709551615), i32) #0 {
 415 main_body:
 416   %descptr = bitcast [0 x i8] addrspace(4)* %0 to <4 x i32> addrspace(4)*, !amdgpu.uniform !0
 417   br label %.outer_loop_header
 418
 419 ret_block:                                       ; preds = %.outer_loop_body, %.inner_loop_header
 420   ret void
 421
 422 .outer_loop_header:
 423   br label %.inner_loop_header
 424
 425 .inner_loop_header:                                     ; preds = %.inner_loop_body, %.outer_loop_header
 426   %loopctr.1 = phi i32 [ 0, %.outer_loop_header ], [ %loopctr.2, %.inner_loop_body ]
 427   %loopctr.2 = add i32 %loopctr.1, 1
 428   %inner_br1 = icmp slt i32 %loopctr.2, 10
 429   br i1 %inner_br1, label %.inner_loop_body, label %ret_block
 430
 431 .inner_loop_body:
 432   %descriptor = load <4 x i32>, <4 x i32> addrspace(4)* %descptr, align 16, !invariant.load !0
 433   %load1result = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %descriptor, i32 0, i32 0)
 434   store float %load1result, float addrspace(1)* undef
 435   %inner_br2 = icmp uge i32 %1, 10
 436   br i1 %inner_br2, label %.inner_loop_header, label %.outer_loop_body
 437
 438 .outer_loop_body:
 439   %offset = shl i32 %loopctr.2, 6
 440   %load2result = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %descriptor, i32 %offset, i32 0)
 441   %outer_br = fcmp ueq float %load2result, 0x0
 442   br i1 %outer_br, label %.outer_loop_header, label %ret_block
 443 }
 444
 445 ; SMRD load with a non-const offset
     ; %ncoff is an inreg argument, so both s.buffer.loads are expected to remain scalar
     ; s_buffer_load_dword instructions that take the offset from an SGPR.
 446 ; GCN-LABEL: {{^}}smrd_load_nonconst0:
 447 ; SIVIGFX9_10: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
 448 ; SIVIGFX9_10: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
 449 ; CI: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
 450 ; CI: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
 451 ; GCN: s_endpgm
 452 define amdgpu_ps void @smrd_load_nonconst0(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in, i32 inreg %ncoff) #0 {
 453 main_body:
 454   %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
 455   %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
 456   %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 %ncoff, i32 0)
 457   %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
 458   %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 %ncoff, i32 0)
 459   %s.buffer.float = bitcast i32 %s.buffer to float
 460   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
 461   ret void
 462 }
 463
 464 ; SMRD load with a non-const non-uniform offset
     ; %ncoff is not inreg here, so both s.buffer.loads are expected to be emitted as
     ; buffer_load_dword with a VGPR offen offset.
 465 ; GCN-LABEL: {{^}}smrd_load_nonconst1:
 466 ; SIVIGFX9_10: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
 467 ; SIVIGFX9_10: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
 468 ; CI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
 469 ; CI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
 470 ; GCN: s_endpgm
 471 define amdgpu_ps void @smrd_load_nonconst1(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in, i32 %ncoff) #0 {
 472 main_body:
 473   %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
 474   %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
 475   %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 %ncoff, i32 0)
 476   %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
 477   %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 %ncoff, i32 0)
 478   %s.buffer.float = bitcast i32 %s.buffer to float
 479   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
 480   ret void
 481 }
 482
 483 ; SMRD load of more than 4 dwords with a non-const non-uniform offset (requires splitting)
     ; The second load is a v8i32 s.buffer.load of which only element 1 is used; the
     ; checks expect a buffer_load_dword followed by a buffer_load_dwordx4.
 484 ; GCN-LABEL: {{^}}smrd_load_nonconst2:
 485 ; SIVIGFX9_10: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
 486 ; SIVIGFX9_10: buffer_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
 487 ; CI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
 488 ; CI: buffer_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
 489 ; GCN: s_endpgm
 490 define amdgpu_ps void @smrd_load_nonconst2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in, i32 %ncoff) #0 {
 491 main_body:
 492   %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
 493   %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
 494   %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 %ncoff, i32 0)
 495   %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
 496   %s.buffer = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %tmp22, i32 %ncoff, i32 0)
 497   %s.buffer.elt = extractelement <8 x i32> %s.buffer, i32 1
 498   %s.buffer.float = bitcast i32 %s.buffer.elt to float
 499   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
 500   ret void
 501 }
 502
 503 ; SMRD load of 16 dwords (<16 x i32>) with a non-const non-uniform offset (requires splitting); the checks below expect four dwordx4 loads, the last three carrying immediate offsets 16, 32 and 48.
 504 ; GCN-LABEL: {{^}}smrd_load_nonconst3:
 505 ; GCN-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
 506 ; GCN-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
 507 ; GCN-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
 508 ; GCN-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
 509 ; GCN: ; return to shader part epilog
 510 define amdgpu_ps <16 x float> @smrd_load_nonconst3(<4 x i32> inreg %rsrc, i32 %off) #0 {
 511 main_body:
 512   %ld = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %off, i32 0) ; descriptor is the inreg %rsrc; offset is the non-inreg %off, used unmodified
 513   %bc = bitcast <16 x i32> %ld to <16 x float> ; reinterpret to match the <16 x float> return type
 514   ret <16 x float> %bc ; all 16 elements are returned, so every part of the load is used
 515 }
 516
 517 ; GCN-LABEL: {{^}}smrd_load_nonconst4:
 518 ; SICI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0xff8, v0 ;
 519 ; SICI-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
 520 ; SICI-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
 521 ; SICI-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
 522 ; SICI-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
 523 ; VIGFX9_10-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 56 offen offset:4032 ;
 524 ; VIGFX9_10-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 56 offen offset:4048 ;
 525 ; VIGFX9_10-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 56 offen offset:4064 ;
 526 ; VIGFX9_10-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 56 offen offset:4080 ;
 527 ; GCN: ; return to shader part epilog
 528 define amdgpu_ps <16 x float> @smrd_load_nonconst4(<4 x i32> inreg %rsrc, i32 %off) #0 {
 529 main_body:
 530   %off.2 = add i32 %off, 4088 ; 4088 = 0xff8. The SICI checks above expect a v_add_i32 of 0xff8; the VIGFX9_10 checks expect the constant split as soffset 56 plus immediate offset 4032 (56 + 4032 = 4088)
 531   %ld = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %off.2, i32 0) ; 16-dword load; the four expected dwordx4 loads are 16 apart (4032, 4048, 4064, 4080 for VIGFX9_10)
 532   %bc = bitcast <16 x i32> %ld to <16 x float> ; reinterpret to match the <16 x float> return type
 533   ret <16 x float> %bc
 534 }
 535
 536 ; GCN-LABEL: {{^}}smrd_load_nonconst5:
 537 ; SICI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0x1004, v0
 538 ; SICI-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
 539 ; SICI-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
 540 ; SICI-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
 541 ; SICI-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
 542 ; VIGFX9_10: s_movk_i32 s4, 0xfc0
 543 ; VIGFX9_10-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], s4 offen offset:68 ;
 544 ; VIGFX9_10-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], s4 offen offset:84 ;
 545 ; VIGFX9_10-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], s4 offen offset:100 ;
 546 ; VIGFX9_10-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], s4 offen offset:116 ;
 547 ; GCN: ; return to shader part epilog
 548 define amdgpu_ps <16 x float> @smrd_load_nonconst5(<4 x i32> inreg %rsrc, i32 %off) #0 {
 549 main_body:
 550   %off.2 = add i32 %off, 4100 ; 4100 = 0x1004. The SICI checks above expect a v_add_i32 of 0x1004; the VIGFX9_10 checks expect 0xfc0 (4032) moved into s4 as soffset plus immediate offset 68 (4032 + 68 = 4100)
 551   %ld = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %off.2, i32 0) ; 16-dword load; the four expected dwordx4 loads are 16 apart (68, 84, 100, 116 for VIGFX9_10)
 552   %bc = bitcast <16 x i32> %ld to <16 x float> ; reinterpret to match the <16 x float> return type
 553   ret <16 x float> %bc
 554 }
 555
 556 ; SMRD load of 2 dwords (<2 x i32>) whose offset is the inreg argument %ncoff; the checks below expect s_buffer_load_dwordx2 with a register offset operand.
 557 ; GCN-LABEL: {{^}}smrd_load_dwordx2:
 558 ; SIVIGFX9_10: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
 559 ; CI: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
 560 ; GCN: s_endpgm
 561 define amdgpu_ps void @smrd_load_dwordx2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in, i32 inreg %ncoff) #0 {
 562 main_body:
 563   %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in ; descriptor, read through %in
 564   %s.buffer = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %tmp22, i32 %ncoff, i32 0) ; 2-dword load; unlike the nonconst tests, %ncoff is inreg here
 565   %s.buffer.float = bitcast <2 x i32> %s.buffer to <2 x float> ; reinterpret so the elements can be passed to the float export
 566   %r.1 = extractelement <2 x float> %s.buffer.float, i32 0
 567   %r.2 = extractelement <2 x float> %s.buffer.float, i32 1
 568   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r.1, float %r.1, float %r.1, float %r.2, i1 true, i1 true) #0 ; both loaded elements are operands of the export
 569   ret void
 570 }
 571
 572 ; GCN-LABEL: {{^}}smrd_uniform_loop:
 573 ; Single-block loop that loads one float per iteration at offset counter * 4 and accumulates it, until counter + 1 >= %bound (unsigned compare).
 574 ; TODO: we should keep the loop counter in an SGPR
 575 ;
 576 ; GCN: s_buffer_load_dword
 577 define amdgpu_ps float @smrd_uniform_loop(<4 x i32> inreg %desc, i32 %bound) #0 {
 578 main_body:
 579   br label %loop
 580
 581 loop:
 582   %counter = phi i32 [ 0, %main_body ], [ %counter.next, %loop ] ; starts at 0, incremented by 1 on each back edge
 583   %sum = phi float [ 0.0, %main_body ], [ %sum.next, %loop ] ; running total, starts at 0.0
 584   %offset = shl i32 %counter, 2 ; offset = counter * 4
 585   %v = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 0) ; offset is derived only from the loop counter; descriptor is the inreg %desc
 586   %sum.next = fadd float %sum, %v
 587   %counter.next = add i32 %counter, 1
 588   %cc = icmp uge i32 %counter.next, %bound ; unsigned: leave the loop once counter.next >= bound
 589   br i1 %cc, label %exit, label %loop
 590
 591 exit:
 592   ret float %sum.next ; the sum including the last loaded value
 593 }
 594
 595
 596 ; GCN-LABEL: {{^}}smrd_uniform_loop2:
 597 ; (this test differs from smrd_uniform_loop by the more complex structure of phis,
 598 ; which used to confuse the DivergenceAnalysis after structurization)
 599 ;
 600 ; TODO: we should keep the loop counter in an SGPR and use an S_BUFFER_LOAD
 601 ;
 602 ; GCN: buffer_load_dword
 603 define amdgpu_ps float @smrd_uniform_loop2(<4 x i32> inreg %desc, i32 %bound, i32 %bound.a) #0 {
 604 main_body:
 605   br label %loop
 606
 607 loop:
 608   %counter = phi i32 [ 0, %main_body ], [ %counter.next, %loop.a ], [ %counter.next, %loop.b ] ; same incoming value on both back edges
 609   %sum = phi float [ 0.0, %main_body ], [ %sum.next, %loop.a ], [ %sum.next.b, %loop.b ] ; incoming value differs per back edge (with or without the extra + 1.0)
 610   %offset = shl i32 %counter, 2 ; offset = counter * 4
 611   %v = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 0) ; offset is derived only from the loop counter; descriptor is the inreg %desc
 612   %sum.next = fadd float %sum, %v
 613   %counter.next = add i32 %counter, 1
 614   %cc = icmp uge i32 %counter.next, %bound ; unsigned: leave the loop once counter.next >= bound
 615   br i1 %cc, label %exit, label %loop.a
 616
 617 loop.a:
 618   %cc.a = icmp uge i32 %counter.next, %bound.a ; NOTE(review): %cc.a has no use anywhere in this function
 619   br i1 %cc, label %loop, label %loop.b ; NOTE(review): branches on %cc, not %cc.a. loop.a is reached only through the false edge of the branch on %cc, so this always goes to %loop.b -- confirm whether %cc.a was intended before changing a reduced test
 620
 621 loop.b:
 622   %sum.next.b = fadd float %sum.next, 1.0 ; extra + 1.0 applied on this back edge only
 623   br label %loop
 624
 625 exit:
 626   ret float %sum.next
 627 }
 628
 629 ; This test checks that the load after some control flow with an offset based
 630 ; on a divergent shader input is correctly recognized as divergent. This was
 631 ; reduced from an actual regression. Yes, the %unused argument matters, as
 632 ; well as the fact that %arg4 is a vector. Do not simplify the IR below.
 633 ;
 634 ; GCN-LABEL: {{^}}arg_divergence:
 635 ; GCN: buffer_load_dword v0, v0,
 636 ; GCN-NEXT: s_waitcnt
 637 ; GCN-NEXT: ; return to shader part epilog
 638 define amdgpu_cs float @arg_divergence(i32 inreg %unused, <3 x i32> %arg4) #0 {
 639 main_body:
 640   br i1 undef, label %if1, label %endif1 ; condition is undef: only the presence of control flow before the load matters here
 641
 642 if1:                                              ; preds = %main_body
 643   store i32 0, i32 addrspace(3)* undef, align 4 ; store to an undef address, gives the conditional block a side effect
 644   br label %endif1
 645
 646 endif1:                                           ; preds = %if1, %main_body
 647   %tmp13 = extractelement <3 x i32> %arg4, i32 0 ; offset is element 0 of the non-inreg vector argument
 648   %tmp97 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 %tmp13, i32 0) ; descriptor is undef; the check above expects a buffer_load_dword with a VGPR (v0) offset
 649   ret float %tmp97
 650 }
 651
 652 ; GCN-LABEL: {{^}}s_buffer_load_f32:
 653 ; GCN: s_buffer_load_dword s0, s[0:3], s4
 654 define amdgpu_ps void @s_buffer_load_f32(<4 x i32> inreg %rsrc, i32 inreg %offset) {
 655   %sgpr = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) ; float result type; descriptor and offset are both inreg arguments
 656   call void asm sideeffect "; use $0", "s"(float %sgpr) ; inline asm with an "s" constraint is the only use of the loaded value
 657   ret void
 658 }
 659
 660 ; GCN-LABEL: {{^}}s_buffer_load_v2f32:
 661 ; GCN: s_buffer_load_dwordx2 s[0:1], s[0:3], s4
 662 define amdgpu_ps void @s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 inreg %offset) {
 663   %sgpr = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %offset, i32 0) ; <2 x float> result type; descriptor and offset are both inreg arguments
 664   call void asm sideeffect "; use $0", "s"(<2 x float> %sgpr) ; inline asm with an "s" constraint is the only use of the loaded value
 665   ret void
 666 }
 667
 668 ; GCN-LABEL: {{^}}s_buffer_load_v4f32:
 669 ; GCN: s_buffer_load_dwordx4 s[0:3], s[0:3], s4
 670 define amdgpu_ps void @s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 inreg %offset) {
 671   %sgpr = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %offset, i32 0) ; <4 x float> result type; descriptor and offset are both inreg arguments
 672   call void asm sideeffect "; use $0", "s"(<4 x float> %sgpr) ; inline asm with an "s" constraint is the only use of the loaded value
 673   ret void
 674 }
 675
 676 ; GCN-LABEL: {{^}}s_buffer_load_v8f32:
 677 ; GCN: s_buffer_load_dwordx8 s[0:7], s[0:3], s4
 678 define amdgpu_ps void @s_buffer_load_v8f32(<4 x i32> inreg %rsrc, i32 inreg %offset) {
 679   %sgpr = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %offset, i32 0) ; <8 x float> result type; descriptor and offset are both inreg arguments
 680   call void asm sideeffect "; use $0", "s"(<8 x float> %sgpr) ; inline asm with an "s" constraint is the only use of the loaded value
 681   ret void
 682 }
 683
 684 ; GCN-LABEL: {{^}}s_buffer_load_v16f32:
 685 ; GCN: s_buffer_load_dwordx16 s[0:15], s[0:3], s4
 686 define amdgpu_ps void @s_buffer_load_v16f32(<4 x i32> inreg %rsrc, i32 inreg %offset) {
 687   %sgpr = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %offset, i32 0) ; <16 x float> result type; descriptor and offset are both inreg arguments
 688   call void asm sideeffect "; use $0", "s"(<16 x float> %sgpr) ; inline asm with an "s" constraint is the only use of the loaded value
 689   ret void
 690 }
 691
 692 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 ; export intrinsic called by the amdgpu_ps tests that return void
 693 declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2 ; NOTE(review): no call in the part of the file reviewed here -- check earlier tests before removing
 694 declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2 ; NOTE(review): no call in the part of the file reviewed here -- check earlier tests before removing
 695
 696 declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32) #1 ; NOTE(review): the only s.buffer.load declaration carrying an attribute group; the ones below have none -- confirm whether that is intentional
 697 declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32)
 698 declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32) ; NOTE(review): no call in the part of the file reviewed here -- presumably used by earlier tests
 699 declare <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32>, i32, i32)
 700 declare <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32>, i32, i32)
 701
 702 declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) ; float-typed variants of the same intrinsic: (descriptor, offset, third i32 operand, always 0 in the calls reviewed here)
 703 declare <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32>, i32, i32)
 704 declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32>, i32, i32)
 705 declare <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32>, i32, i32)
 706 declare <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32>, i32, i32)
 707
 708 attributes #0 = { nounwind } ; used by most test functions, the export call sites and the export declaration
 709 attributes #1 = { nounwind readnone } ; used by the s.buffer.load.i32 declaration
 710 attributes #2 = { nounwind readnone speculatable } ; used by the two interp declarations
 711
 712 !0 = !{} ; empty metadata node; NOTE(review): not referenced in the part of the file reviewed here