llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll

   1 ; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-simplify-libcall < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-POSTLINK %s
   2 ; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-simplify-libcall -amdgpu-prelink -amdgpu-enable-ocl-mangling-mismatch-workaround=0 <%s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-PRELINK %s
   3 ; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-use-native -amdgpu-prelink < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-NATIVE %s
   4 ; RUN: opt -S -passes='default<O1>' -mtriple=amdgcn-- -amdgpu-simplify-libcall < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-POSTLINK %s
   5 ; RUN: opt -S -passes='default<O1>' -mtriple=amdgcn-- -amdgpu-simplify-libcall -amdgpu-prelink -amdgpu-enable-ocl-mangling-mismatch-workaround=0 <%s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-PRELINK %s
   6 ; RUN: opt -S -passes='default<O1>' -mtriple=amdgcn-- -amdgpu-use-native -amdgpu-prelink < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-NATIVE %s
   7
   8 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos(
   9 ; GCN-POSTLINK: call fast float @_Z3sinf(
  10 ; GCN-POSTLINK: call fast float @_Z3cosf(
  11 ; GCN-PRELINK: call fast float @_Z6sincosfPU3AS5f(
  12 ; GCN-NATIVE: call fast float @_Z10native_sinf(
  13 ; GCN-NATIVE: call fast float @_Z10native_cosf(
  14 define amdgpu_kernel void @test_sincos(ptr addrspace(1) nocapture %a) { ; sin and cos of one scalar load; the label above ends in '(' so it cannot also match the @test_sincos_v* kernels
  15 entry:
  16   %tmp = load float, ptr addrspace(1) %a, align 4
  17   %call = call fast float @_Z3sinf(float %tmp)
  18   store float %call, ptr addrspace(1) %a, align 4 ; sin result overwrites a[0]
  19   %call2 = call fast float @_Z3cosf(float %tmp) ; same operand as the sin call
  20   %arrayidx3 = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
  21   store float %call2, ptr addrspace(1) %arrayidx3, align 4 ; cos result goes to a[1]
  22   ret void
  23 }
  24
  25 declare float @_Z3sinf(float)
  26
  27 declare float @_Z3cosf(float)
  28
  29 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v2
  30 ; GCN-POSTLINK: call fast <2 x float> @_Z3sinDv2_f(
  31 ; GCN-POSTLINK: call fast <2 x float> @_Z3cosDv2_f(
  32 ; GCN-PRELINK: call fast <2 x float> @_Z6sincosDv2_fPU3AS5S_(
  33 ; GCN-NATIVE: call fast <2 x float> @_Z10native_sinDv2_f(
  34 ; GCN-NATIVE: call fast <2 x float> @_Z10native_cosDv2_f(
  35 define amdgpu_kernel void @test_sincos_v2(ptr addrspace(1) nocapture %a) { ; <2 x float> variant: sin and cos of the same loaded vector
  36 entry:
  37   %tmp = load <2 x float>, ptr addrspace(1) %a, align 8
  38   %call = call fast <2 x float> @_Z3sinDv2_f(<2 x float> %tmp)
  39   store <2 x float> %call, ptr addrspace(1) %a, align 8 ; sin result overwrites the first vector slot
  40   %call2 = call fast <2 x float> @_Z3cosDv2_f(<2 x float> %tmp) ; same operand as the sin call
  41   %arrayidx3 = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i64 1
  42   store <2 x float> %call2, ptr addrspace(1) %arrayidx3, align 8 ; cos result goes to the second vector slot
  43   ret void
  44 }
  45
  46 declare <2 x float> @_Z3sinDv2_f(<2 x float>)
  47
  48 declare <2 x float> @_Z3cosDv2_f(<2 x float>)
  49
  50 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v3
  51 ; GCN-POSTLINK: call fast <3 x float> @_Z3sinDv3_f(
  52 ; GCN-POSTLINK: call fast <3 x float> @_Z3cosDv3_f(
  53 ; GCN-PRELINK: call fast <3 x float> @_Z6sincosDv3_fPU3AS5S_(
  54 ; GCN-NATIVE: call fast <3 x float> @_Z10native_sinDv3_f(
  55 ; GCN-NATIVE: call fast <3 x float> @_Z10native_cosDv3_f(
  56 define amdgpu_kernel void @test_sincos_v3(ptr addrspace(1) nocapture %a) { ; <3 x float> variant: memory traffic is <4 x float>, the calls take <3 x float>
  57 entry:
  58   %loadVec4 = load <4 x float>, ptr addrspace(1) %a, align 16
  59   %extractVec4 = shufflevector <4 x float> %loadVec4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> ; keep lanes 0..2 as the call operand
  60   %call = call fast <3 x float> @_Z3sinDv3_f(<3 x float> %extractVec4)
  61   %extractVec6 = shufflevector <3 x float> %call, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef> ; widen back to 4 lanes, lane 3 undef
  62   store <4 x float> %extractVec6, ptr addrspace(1) %a, align 16
  63   %call11 = call fast <3 x float> @_Z3cosDv3_f(<3 x float> %extractVec4) ; same operand as the sin call
  64   %arrayidx12 = getelementptr inbounds <3 x float>, ptr addrspace(1) %a, i64 1
  65   %extractVec13 = shufflevector <3 x float> %call11, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef> ; widen back to 4 lanes, lane 3 undef
  66   store <4 x float> %extractVec13, ptr addrspace(1) %arrayidx12, align 16
  67   ret void
  68 }
  69
  70 declare <3 x float> @_Z3sinDv3_f(<3 x float>)
  71
  72 declare <3 x float> @_Z3cosDv3_f(<3 x float>)
  73
  74 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v4
  75 ; GCN-POSTLINK: call fast <4 x float> @_Z3sinDv4_f(
  76 ; GCN-POSTLINK: call fast <4 x float> @_Z3cosDv4_f(
  77 ; GCN-PRELINK: call fast <4 x float> @_Z6sincosDv4_fPU3AS5S_(
  78 ; GCN-NATIVE: call fast <4 x float> @_Z10native_sinDv4_f(
  79 ; GCN-NATIVE: call fast <4 x float> @_Z10native_cosDv4_f(
  80 define amdgpu_kernel void @test_sincos_v4(ptr addrspace(1) nocapture %a) { ; <4 x float> variant: sin and cos of the same loaded vector
  81 entry:
  82   %tmp = load <4 x float>, ptr addrspace(1) %a, align 16
  83   %call = call fast <4 x float> @_Z3sinDv4_f(<4 x float> %tmp)
  84   store <4 x float> %call, ptr addrspace(1) %a, align 16 ; sin result overwrites the first vector slot
  85   %call2 = call fast <4 x float> @_Z3cosDv4_f(<4 x float> %tmp) ; same operand as the sin call
  86   %arrayidx3 = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i64 1
  87   store <4 x float> %call2, ptr addrspace(1) %arrayidx3, align 16 ; cos result goes to the second vector slot
  88   ret void
  89 }
  90
  91 declare <4 x float> @_Z3sinDv4_f(<4 x float>)
  92
  93 declare <4 x float> @_Z3cosDv4_f(<4 x float>)
  94
  95 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v8
  96 ; GCN-POSTLINK: call fast <8 x float> @_Z3sinDv8_f(
  97 ; GCN-POSTLINK: call fast <8 x float> @_Z3cosDv8_f(
  98 ; GCN-PRELINK: call fast <8 x float> @_Z6sincosDv8_fPU3AS5S_(
  99 ; GCN-NATIVE: call fast <8 x float> @_Z10native_sinDv8_f(
 100 ; GCN-NATIVE: call fast <8 x float> @_Z10native_cosDv8_f(
 101 define amdgpu_kernel void @test_sincos_v8(ptr addrspace(1) nocapture %a) { ; <8 x float> variant: sin and cos of the same loaded vector
 102 entry:
 103   %tmp = load <8 x float>, ptr addrspace(1) %a, align 32
 104   %call = call fast <8 x float> @_Z3sinDv8_f(<8 x float> %tmp)
 105   store <8 x float> %call, ptr addrspace(1) %a, align 32 ; sin result overwrites the first vector slot
 106   %call2 = call fast <8 x float> @_Z3cosDv8_f(<8 x float> %tmp) ; same operand as the sin call
 107   %arrayidx3 = getelementptr inbounds <8 x float>, ptr addrspace(1) %a, i64 1
 108   store <8 x float> %call2, ptr addrspace(1) %arrayidx3, align 32 ; cos result goes to the second vector slot
 109   ret void
 110 }
 111
 112 declare <8 x float> @_Z3sinDv8_f(<8 x float>)
 113
 114 declare <8 x float> @_Z3cosDv8_f(<8 x float>)
 115
 116 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v16
 117 ; GCN-POSTLINK: call fast <16 x float> @_Z3sinDv16_f(
 118 ; GCN-POSTLINK: call fast <16 x float> @_Z3cosDv16_f(
 119 ; GCN-PRELINK: call fast <16 x float> @_Z6sincosDv16_fPU3AS5S_(
 120 ; GCN-NATIVE: call fast <16 x float> @_Z10native_sinDv16_f(
 121 ; GCN-NATIVE: call fast <16 x float> @_Z10native_cosDv16_f(
 122 define amdgpu_kernel void @test_sincos_v16(ptr addrspace(1) nocapture %a) { ; <16 x float> variant: sin and cos of the same loaded vector
 123 entry:
 124   %tmp = load <16 x float>, ptr addrspace(1) %a, align 64
 125   %call = call fast <16 x float> @_Z3sinDv16_f(<16 x float> %tmp)
 126   store <16 x float> %call, ptr addrspace(1) %a, align 64 ; sin result overwrites the first vector slot
 127   %call2 = call fast <16 x float> @_Z3cosDv16_f(<16 x float> %tmp) ; same operand as the sin call
 128   %arrayidx3 = getelementptr inbounds <16 x float>, ptr addrspace(1) %a, i64 1
 129   store <16 x float> %call2, ptr addrspace(1) %arrayidx3, align 64 ; cos result goes to the second vector slot
 130   ret void
 131 }
 132
 133 declare <16 x float> @_Z3sinDv16_f(<16 x float>)
 134
 135 declare <16 x float> @_Z3cosDv16_f(<16 x float>)
 136
 137 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_native_recip
 138 ; GCN: %call = tail call fast float @_Z12native_recipf(float 3.000000e+00)
 139 define amdgpu_kernel void @test_native_recip(ptr addrspace(1) nocapture %a) { ; native_recip of the constant 3.0; the check expects the call to survive under every RUN configuration
 140 entry:
 141   %call = call fast float @_Z12native_recipf(float 3.000000e+00)
 142   store float %call, ptr addrspace(1) %a, align 4
 143   ret void
 144 }
 145
 146 declare float @_Z12native_recipf(float)
 147
 148 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_half_recip
 149 ; GCN: %call = tail call fast float @_Z10half_recipf(float 3.000000e+00)
 150 define amdgpu_kernel void @test_half_recip(ptr addrspace(1) nocapture %a) { ; half_recip of the constant 3.0; the check expects the call to survive under every RUN configuration
 151 entry:
 152   %call = call fast float @_Z10half_recipf(float 3.000000e+00)
 153   store float %call, ptr addrspace(1) %a, align 4
 154   ret void
 155 }
 156
 157 declare float @_Z10half_recipf(float)
 158
 159 ; Do nothing, the underlying implementation will optimize correctly
 160 ; after inlining.
 161 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_native_divide
 162 ; GCN: %call = tail call fast float @_Z13native_divideff(float %tmp, float 3.000000e+00)
 163 define amdgpu_kernel void @test_native_divide(ptr addrspace(1) nocapture %a) { ; native_divide(x, 3.0); the check expects the call to be left in place
 164 entry:
 165   %tmp = load float, ptr addrspace(1) %a, align 4
 166   %call = call fast float @_Z13native_divideff(float %tmp, float 3.000000e+00)
 167   store float %call, ptr addrspace(1) %a, align 4 ; result overwrites the loaded operand
 168   ret void
 169 }
 170
 171 declare float @_Z13native_divideff(float, float)
 172
 173 ; Do nothing, the optimization will naturally happen after inlining.
 174
 175 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_half_divide
 176 ; GCN: %call = tail call fast float @_Z11half_divideff(float %tmp, float 3.000000e+00)
 177 define amdgpu_kernel void @test_half_divide(ptr addrspace(1) nocapture %a) { ; half_divide(x, 3.0); the check expects the call to be left in place
 178 entry:
 179   %tmp = load float, ptr addrspace(1) %a, align 4
 180   %call = call fast float @_Z11half_divideff(float %tmp, float 3.000000e+00)
 181   store float %call, ptr addrspace(1) %a, align 4 ; result overwrites the loaded operand
 182   ret void
 183 }
 184
 185 declare float @_Z11half_divideff(float, float)
 186
 187 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_0f
 188 ; GCN: store float 1.000000e+00, ptr addrspace(1) %a
 189 define amdgpu_kernel void @test_pow_0f(ptr addrspace(1) nocapture %a) { ; pow(x, 0.0); the check expects the constant 1.0 to be stored
 190 entry:
 191   %tmp = load float, ptr addrspace(1) %a, align 4
 192   %call = call fast float @_Z3powff(float %tmp, float 0.000000e+00)
 193   store float %call, ptr addrspace(1) %a, align 4
 194   ret void
 195 }
 196
 197 declare float @_Z3powff(float, float)
 198
 199 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_0i
 200 ; GCN: store float 1.000000e+00, ptr addrspace(1) %a
 201 define amdgpu_kernel void @test_pow_0i(ptr addrspace(1) nocapture %a) { ; pow(x, 0.0); NOTE(review): body is identical to test_pow_0f despite the 'i' suffix - confirm whether a different exponent encoding was intended
 202 entry:
 203   %tmp = load float, ptr addrspace(1) %a, align 4
 204   %call = call fast float @_Z3powff(float %tmp, float 0.000000e+00)
 205   store float %call, ptr addrspace(1) %a, align 4
 206   ret void
 207 }
 208
 209 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_1f
 210 ; GCN: %tmp = load float, ptr addrspace(1) %arrayidx, align 4
 211 ; GCN: store float %tmp, ptr addrspace(1) %a, align 4
 212 define amdgpu_kernel void @test_pow_1f(ptr addrspace(1) nocapture %a) { ; pow(x, 1.0); the checks expect the loaded value to be stored unchanged
 213 entry:
 214   %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
 215   %tmp = load float, ptr addrspace(1) %arrayidx, align 4 ; operand comes from a[1]
 216   %call = call fast float @_Z3powff(float %tmp, float 1.000000e+00)
 217   store float %call, ptr addrspace(1) %a, align 4 ; result goes to a[0]
 218   ret void
 219 }
 220
 221 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_1i
 222 ; GCN: %tmp = load float, ptr addrspace(1) %arrayidx, align 4
 223 ; GCN: store float %tmp, ptr addrspace(1) %a, align 4
 224 define amdgpu_kernel void @test_pow_1i(ptr addrspace(1) nocapture %a) { ; pow(x, 1.0); NOTE(review): body is identical to test_pow_1f despite the 'i' suffix - confirm intent
 225 entry:
 226   %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
 227   %tmp = load float, ptr addrspace(1) %arrayidx, align 4 ; operand comes from a[1]
 228   %call = call fast float @_Z3powff(float %tmp, float 1.000000e+00)
 229   store float %call, ptr addrspace(1) %a, align 4 ; result goes to a[0]
 230   ret void
 231 }
 232
 233 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_2f
 234 ; GCN: %tmp = load float, ptr addrspace(1) %a, align 4
 235 ; GCN: %__pow2 = fmul fast float %tmp, %tmp
 236 define amdgpu_kernel void @test_pow_2f(ptr addrspace(1) nocapture %a) { ; pow(x, 2.0); the checks expect a single fmul of x with itself
 237 entry:
 238   %tmp = load float, ptr addrspace(1) %a, align 4
 239   %call = call fast float @_Z3powff(float %tmp, float 2.000000e+00)
 240   store float %call, ptr addrspace(1) %a, align 4
 241   ret void
 242 }
 243
 244 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_2i
 245 ; GCN: %tmp = load float, ptr addrspace(1) %a, align 4
 246 ; GCN: %__pow2 = fmul fast float %tmp, %tmp
 247 define amdgpu_kernel void @test_pow_2i(ptr addrspace(1) nocapture %a) { ; pow(x, 2.0); NOTE(review): body is identical to test_pow_2f despite the 'i' suffix - confirm intent
 248 entry:
 249   %tmp = load float, ptr addrspace(1) %a, align 4
 250   %call = call fast float @_Z3powff(float %tmp, float 2.000000e+00)
 251   store float %call, ptr addrspace(1) %a, align 4
 252   ret void
 253 }
 254
 255 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_m1f
 256 ; GCN: %tmp = load float, ptr addrspace(1) %arrayidx, align 4
 257 ; GCN: %__powrecip = fdiv fast float 1.000000e+00, %tmp
 258 define amdgpu_kernel void @test_pow_m1f(ptr addrspace(1) nocapture %a) { ; pow(x, -1.0); the checks expect an fdiv of 1.0 by x
 259 entry:
 260   %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
 261   %tmp = load float, ptr addrspace(1) %arrayidx, align 4 ; operand comes from a[1]
 262   %call = call fast float @_Z3powff(float %tmp, float -1.000000e+00)
 263   store float %call, ptr addrspace(1) %a, align 4 ; result goes to a[0]
 264   ret void
 265 }
 266
 267 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_m1i
 268 ; GCN: %tmp = load float, ptr addrspace(1) %arrayidx, align 4
 269 ; GCN: %__powrecip = fdiv fast float 1.000000e+00, %tmp
 270 define amdgpu_kernel void @test_pow_m1i(ptr addrspace(1) nocapture %a) { ; pow(x, -1.0); NOTE(review): body is identical to test_pow_m1f despite the 'i' suffix - confirm intent
 271 entry:
 272   %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
 273   %tmp = load float, ptr addrspace(1) %arrayidx, align 4 ; operand comes from a[1]
 274   %call = call fast float @_Z3powff(float %tmp, float -1.000000e+00)
 275   store float %call, ptr addrspace(1) %a, align 4 ; result goes to a[0]
 276   ret void
 277 }
 278
 279 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_half
 280 ; GCN-POSTLINK: call fast float @_Z3powff(float %tmp, float 5.000000e-01)
 281 ; GCN-PRELINK: %__pow2sqrt = tail call fast float @_Z4sqrtf(float %tmp)
 282 define amdgpu_kernel void @test_pow_half(ptr addrspace(1) nocapture %a) { ; pow(x, 0.5); the postlink check expects the pow call kept, the prelink check expects a sqrt call
 283 entry:
 284   %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
 285   %tmp = load float, ptr addrspace(1) %arrayidx, align 4 ; operand comes from a[1]
 286   %call = call fast float @_Z3powff(float %tmp, float 5.000000e-01)
 287   store float %call, ptr addrspace(1) %a, align 4 ; result goes to a[0]
 288   ret void
 289 }
 290
 291 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_mhalf
 292 ; GCN-POSTLINK: call fast float @_Z3powff(float %tmp, float -5.000000e-01)
 293 ; GCN-PRELINK: %__pow2rsqrt = tail call fast float @_Z5rsqrtf(float %tmp)
 294 define amdgpu_kernel void @test_pow_mhalf(ptr addrspace(1) nocapture %a) { ; pow(x, -0.5); the postlink check expects the pow call kept, the prelink check expects an rsqrt call
 295 entry:
 296   %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
 297   %tmp = load float, ptr addrspace(1) %arrayidx, align 4 ; operand comes from a[1]
 298   %call = call fast float @_Z3powff(float %tmp, float -5.000000e-01)
 299   store float %call, ptr addrspace(1) %a, align 4 ; result goes to a[0]
 300   ret void
 301 }
 302
 303 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_c
 304 ; GCN: %__powx2 = fmul fast float %tmp, %tmp
 305 ; GCN: %__powx21 = fmul fast float %__powx2, %__powx2
 306 ; GCN: %__powx22 = fmul fast float %__powx2, %tmp
 307 ; GCN: %[[r0:.*]] = fmul fast float %__powx21, %__powx21
 308 ; GCN: %__powprod3 = fmul fast float %[[r0]], %__powx22
 309 define amdgpu_kernel void @test_pow_c(ptr addrspace(1) nocapture %a) { ; pow(x, 11.0); the checks expect a chain of fmuls (x^2, x^4, x^3, x^8, x^11) instead of a call
 310 entry:
 311   %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
 312   %tmp = load float, ptr addrspace(1) %arrayidx, align 4 ; operand comes from a[1]
 313   %call = call fast float @_Z3powff(float %tmp, float 1.100000e+01)
 314   store float %call, ptr addrspace(1) %a, align 4 ; result goes to a[0]
 315   ret void
 316 }
 317
 318 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_powr_c
 319 ; GCN: %__powx2 = fmul fast float %tmp, %tmp
 320 ; GCN: %__powx21 = fmul fast float %__powx2, %__powx2
 321 ; GCN: %__powx22 = fmul fast float %__powx2, %tmp
 322 ; GCN: %[[r0:.*]] = fmul fast float %__powx21, %__powx21
 323 ; GCN: %__powprod3 = fmul fast float %[[r0]], %__powx22
 324 define amdgpu_kernel void @test_powr_c(ptr addrspace(1) nocapture %a) { ; powr(x, 11.0); the checks expect the same fmul chain as test_pow_c
 325 entry:
 326   %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
 327   %tmp = load float, ptr addrspace(1) %arrayidx, align 4 ; operand comes from a[1]
 328   %call = call fast float @_Z4powrff(float %tmp, float 1.100000e+01)
 329   store float %call, ptr addrspace(1) %a, align 4 ; result goes to a[0]
 330   ret void
 331 }
 332
 333 declare float @_Z4powrff(float, float)
 334
 335 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pown_c
 336 ; GCN: %__powx2 = fmul fast float %tmp, %tmp
 337 ; GCN: %__powx21 = fmul fast float %__powx2, %__powx2
 338 ; GCN: %__powx22 = fmul fast float %__powx2, %tmp
 339 ; GCN: %[[r0:.*]] = fmul fast float %__powx21, %__powx21
 340 ; GCN: %__powprod3 = fmul fast float %[[r0]], %__powx22
 341 define amdgpu_kernel void @test_pown_c(ptr addrspace(1) nocapture %a) { ; pown(x, 11) with an i32 exponent; the checks expect the same fmul chain as test_pow_c
 342 entry:
 343   %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
 344   %tmp = load float, ptr addrspace(1) %arrayidx, align 4 ; operand comes from a[1]
 345   %call = call fast float @_Z4pownfi(float %tmp, i32 11) ; @_Z4pownfi is declared further down the file
 346   store float %call, ptr addrspace(1) %a, align 4 ; result goes to a[0]
 347   ret void
 348 }
 349
 350 declare half @_Z4pownDhi(half, i32)
 351
 352 ; GCN-LABEL: {{^}}define half @test_pown_f16(
 353 ; GCN-NATIVE: %__fabs = tail call fast half @llvm.fabs.f16(half %x)
 354 ; GCN-NATIVE: %__log2 = tail call fast half @llvm.log2.f16(half %__fabs)
 355 ; GCN-NATIVE: %pownI2F = sitofp i32 %y to half
 356 ; GCN-NATIVE: %__ylogx = fmul fast half %__log2, %pownI2F
 357 ; GCN-NATIVE: %__exp2 = tail call fast half @llvm.exp2.f16(half %__ylogx)
 358 ; GCN-NATIVE: %__ytou = trunc i32 %y to i16
 359 ; GCN-NATIVE: %__yeven = shl i16 %__ytou, 15
 360 ; GCN-NATIVE: %0 = bitcast half %x to i16
 361 ; GCN-NATIVE: %__pow_sign = and i16 %__yeven, %0
 362 ; GCN-NATIVE: %1 = bitcast half %__exp2 to i16
 363 ; GCN-NATIVE: %2 = or i16 %__pow_sign, %1
 364 ; GCN-NATIVE: %3 = bitcast i16 %2 to half
 365 define half @test_pown_f16(half %x, i32 %y) { ; half-precision pown with a variable i32 exponent; only the native RUN lines have body checks here
 366 entry:
 367   %call = call fast half @_Z4pownDhi(half %x, i32 %y)
 368   ret half %call
 369 }
 370
 371 declare float @_Z4pownfi(float, i32) ; float pown, called by test_pown_c and test_pown
 372
 373 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow(
 374 ; GCN: %__fabs = tail call fast float @llvm.fabs.f32(float %tmp)
 375 ; GCN: %__log2 = tail call fast float @llvm.log2.f32(float %__fabs)
 376 ; GCN: %__ylogx = fmul fast float %__log2, 1.013000e+03
 377 ; GCN: %__exp2 = tail call fast float @llvm.exp2.f32(float %__ylogx)
 378 ; GCN: %[[r0:.*]] = bitcast float %tmp to i32
 379 ; GCN: %__pow_sign = and i32 %[[r0]], -2147483648
 380 ; GCN: %[[r1:.*]] = bitcast float %__exp2 to i32
 381 ; GCN: %[[r2:.*]] = or i32 %__pow_sign, %[[r1]]
 382 ; GCN: store i32 %[[r2]], ptr addrspace(1) %a, align 4
 383 define amdgpu_kernel void @test_pow(ptr addrspace(1) nocapture %a) { ; pow(x, 1013.0); the checks expect exp2(1013 * log2(|x|)) with the sign bit of x OR-ed back in; the label ends in '(' so it cannot also match the other @test_pow* kernels
 384 entry:
 385   %tmp = load float, ptr addrspace(1) %a, align 4
 386   %call = call fast float @_Z3powff(float %tmp, float 1.013000e+03)
 387   store float %call, ptr addrspace(1) %a, align 4 ; result overwrites the loaded operand
 388   ret void
 389 }
 390
 391 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_powr(
 392 ; GCN: %__log2 = tail call fast float @llvm.log2.f32(float %tmp)
 393 ; GCN: %__ylogx = fmul fast float %tmp1, %__log2
 394 ; GCN: %__exp2 = tail call fast float @llvm.exp2.f32(float %__ylogx)
 395 ; GCN: store float %__exp2, ptr addrspace(1) %a, align 4
 396 define amdgpu_kernel void @test_powr(ptr addrspace(1) nocapture %a) { ; powr(x, y) with both operands loaded; the checks expect exp2(y * log2(x)); the label ends in '(' so it cannot also match @test_powr_c
 397 entry:
 398   %tmp = load float, ptr addrspace(1) %a, align 4 ; base x comes from a[0]
 399   %arrayidx1 = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
 400   %tmp1 = load float, ptr addrspace(1) %arrayidx1, align 4 ; exponent y comes from a[1]
 401   %call = call fast float @_Z4powrff(float %tmp, float %tmp1)
 402   store float %call, ptr addrspace(1) %a, align 4 ; result goes to a[0]
 403   ret void
 404 }
 405
 406 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pown(
 407 ; GCN: %conv = fptosi float %tmp1 to i32
 408 ; GCN: %__fabs = tail call fast float @llvm.fabs.f32(float %tmp)
 409 ; GCN: %__log2 = tail call fast float @llvm.log2.f32(float %__fabs)
 410 ; GCN: %pownI2F = sitofp i32 %conv to float
 411 ; GCN: %__ylogx = fmul fast float %__log2, %pownI2F
 412 ; GCN: %__exp2 = tail call fast float @llvm.exp2.f32(float %__ylogx)
 413 ; GCN: %__yeven = shl i32 %conv, 31
 414 ; GCN: %[[r0:.*]] = bitcast float %tmp to i32
 415 ; GCN: %__pow_sign = and i32 %__yeven, %[[r0]]
 416 ; GCN: %[[r1:.*]] = bitcast float %__exp2 to i32
 417 ; GCN: %[[r2:.*]] = or i32 %__pow_sign, %[[r1]]
 418 ; GCN: store i32 %[[r2]], ptr addrspace(1) %a, align 4
 419 define amdgpu_kernel void @test_pown(ptr addrspace(1) nocapture %a) { ; pown(x, n) with a variable exponent; the checks expect exp2(n * log2(|x|)) with a sign derived from the low bit of n and the sign bit of x; the label ends in '(' so it cannot also match @test_pown_c
 420 entry:
 421   %tmp = load float, ptr addrspace(1) %a, align 4 ; base x comes from a[0]
 422   %arrayidx1 = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
 423   %tmp1 = load float, ptr addrspace(1) %arrayidx1, align 4
 424   %conv = fptosi float %tmp1 to i32 ; integer exponent is converted from the float at a[1]
 425   %call = call fast float @_Z4pownfi(float %tmp, i32 %conv)
 426   store float %call, ptr addrspace(1) %a, align 4 ; result goes to a[0]
 427   ret void
 428 }
 429
 430 declare half @_Z3powDhDh(half, half)
 431 declare <2 x half> @_Z3powDv2_DhS_(<2 x half>, <2 x half>)
 432
 433 ; GCN-LABEL: define half @test_pow_fast_f16__y_13(half %x)
 434 ; GCN: %__fabs = tail call fast half @llvm.fabs.f16(half %x)
 435 ; GCN: %__log2 = tail call fast half @llvm.log2.f16(half %__fabs)
 436 ; GCN: %__ylogx = fmul fast half %__log2, 0xH4A80
 437 ; GCN: %__exp2 = tail call fast half @llvm.exp2.f16(half %__ylogx)
 438 ; GCN: %1 = bitcast half %x to i16
 439 ; GCN: %__pow_sign = and i16 %1, -32768
 440 ; GCN: %2 = bitcast half %__exp2 to i16
 441 ; GCN: %3 = or i16 %__pow_sign, %2
 442 ; GCN: %4 = bitcast i16 %3 to half
 443 define half @test_pow_fast_f16__y_13(half %x) { ; half pow(x, 13.0); the checks expect exp2(13 * log2(|x|)) with the sign bit of x OR-ed back in; no entry label, so the checked unnamed values start at %1
 444   %powr = tail call fast half @_Z3powDhDh(half %x, half 13.0) ; the value is named %powr but the callee is pow, not powr
 445   ret half %powr
 446 }
 447
 448 ; GCN-LABEL: define <2 x half> @test_pow_fast_v2f16__y_13(<2 x half> %x)
 449 ; GCN: %__fabs = tail call fast <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
 450 ; GCN: %__log2 = tail call fast <2 x half> @llvm.log2.v2f16(<2 x half> %__fabs)
 451 ; GCN: %__ylogx = fmul fast <2 x half> %__log2, <half 0xH4A80, half 0xH4A80>
 452 ; GCN: %__exp2 = tail call fast <2 x half> @llvm.exp2.v2f16(<2 x half> %__ylogx)
 453 ; GCN: %1 = bitcast <2 x half> %x to <2 x i16>
 454 ; GCN: %__pow_sign = and <2 x i16> %1, <i16 -32768, i16 -32768>
 455 ; GCN: %2 = bitcast <2 x half> %__exp2 to <2 x i16>
 456 ; GCN: %3 = or <2 x i16> %__pow_sign, %2
 457 ; GCN: %4 = bitcast <2 x i16> %3 to <2 x half>
 458 define <2 x half> @test_pow_fast_v2f16__y_13(<2 x half> %x) { ; <2 x half> pow with a splat exponent of 13.0; the checks expect the vector form of the scalar f16 expansion
 459   %powr = tail call fast <2 x half> @_Z3powDv2_DhS_(<2 x half> %x, <2 x half> <half 13.0, half 13.0>) ; the value is named %powr but the callee is pow, not powr
 460   ret <2 x half> %powr
 461 }
 462
 463 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_1
 464 ; GCN: %tmp = load float, ptr addrspace(1) %arrayidx, align 4
 465 ; GCN: store float %tmp, ptr addrspace(1) %a, align 4
 466 define amdgpu_kernel void @test_rootn_1(ptr addrspace(1) nocapture %a) { ; rootn(x, 1); the checks expect the loaded value to be stored unchanged
 467 entry:
 468   %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
 469   %tmp = load float, ptr addrspace(1) %arrayidx, align 4 ; operand comes from a[1]
 470   %call = call fast float @_Z5rootnfi(float %tmp, i32 1)
 471   store float %call, ptr addrspace(1) %a, align 4 ; result goes to a[0]
 472   ret void
 473 }
 474
 475 declare float @_Z5rootnfi(float, i32)
 476
 477 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_2
 478 ; GCN-POSTLINK: call fast float @_Z5rootnfi(float %tmp, i32 2)
 479 ; GCN-PRELINK: %__rootn2sqrt = tail call fast float @_Z4sqrtf(float %tmp)
 480 define amdgpu_kernel void @test_rootn_2(ptr addrspace(1) nocapture %a) { ; rootn(x, 2); the postlink check expects the rootn call kept, the prelink check expects a sqrt call
 481 entry:
 482   %tmp = load float, ptr addrspace(1) %a, align 4
 483   %call = call fast float @_Z5rootnfi(float %tmp, i32 2)
 484   store float %call, ptr addrspace(1) %a, align 4
 485   ret void
 486 }
 487
 488 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_3
 489 ; GCN-POSTLINK: call fast float @_Z5rootnfi(float %tmp, i32 3)
 490 ; GCN-PRELINK: %__rootn2cbrt = tail call fast float @_Z4cbrtf(float %tmp)
 491 define amdgpu_kernel void @test_rootn_3(ptr addrspace(1) nocapture %a) { ; rootn(x, 3); the postlink check expects the rootn call kept, the prelink check expects a cbrt call
 492 entry:
 493   %tmp = load float, ptr addrspace(1) %a, align 4
 494   %call = call fast float @_Z5rootnfi(float %tmp, i32 3)
 495   store float %call, ptr addrspace(1) %a, align 4
 496   ret void
 497 }
 498
 499 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_m1
 500 ; GCN: fdiv fast float 1.000000e+00, %tmp
 501 define amdgpu_kernel void @test_rootn_m1(ptr addrspace(1) nocapture %a) { ; rootn(x, -1); the check expects an fdiv of 1.0 by x
 502 entry:
 503   %tmp = load float, ptr addrspace(1) %a, align 4
 504   %call = call fast float @_Z5rootnfi(float %tmp, i32 -1)
 505   store float %call, ptr addrspace(1) %a, align 4
 506   ret void
 507 }
 508
 509 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_m2
 510 ; GCN-POSTLINK: call fast float @_Z5rootnfi(float %tmp, i32 -2)
 511 ; GCN-PRELINK: %__rootn2rsqrt = tail call fast float @_Z5rsqrtf(float %tmp)
 512 define amdgpu_kernel void @test_rootn_m2(ptr addrspace(1) nocapture %a) { ; rootn(x, -2); the postlink check expects the rootn call kept, the prelink check expects an rsqrt call
 513 entry:
 514   %tmp = load float, ptr addrspace(1) %a, align 4
 515   %call = call fast float @_Z5rootnfi(float %tmp, i32 -2)
 516   store float %call, ptr addrspace(1) %a, align 4
 517   ret void
 518 }
 519
 520 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_0x
 521 ; GCN: store float %y,
 522 define amdgpu_kernel void @test_fma_0x(ptr addrspace(1) nocapture %a, float %y) { ; fma(0.0, x, y); the check expects the addend %y to be stored directly (trailing comma pins the operand name exactly, as in the sibling tests)
 523 entry:
 524   %tmp = load float, ptr addrspace(1) %a, align 4
 525   %call = call fast float @_Z3fmafff(float 0.000000e+00, float %tmp, float %y)
 526   store float %call, ptr addrspace(1) %a, align 4
 527   ret void
 528 }
 529
 530 declare float @_Z3fmafff(float, float, float)
 531
 532 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_x0
 533 ; GCN: store float %y,
 534 define amdgpu_kernel void @test_fma_x0(ptr addrspace(1) nocapture %a, float %y) { ; fma(x, 0.0, y); the check expects the addend %y to be stored directly
 535 entry:
 536   %tmp = load float, ptr addrspace(1) %a, align 4
 537   %call = call fast float @_Z3fmafff(float %tmp, float 0.000000e+00, float %y)
 538   store float %call, ptr addrspace(1) %a, align 4
 539   ret void
 540 }
 541
 542 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_mad_0x
 543 ; GCN: store float %y,
 544 define amdgpu_kernel void @test_mad_0x(ptr addrspace(1) nocapture %a, float %y) { ; mad(0.0, x, y); the check expects the addend %y to be stored directly
 545 entry:
 546   %tmp = load float, ptr addrspace(1) %a, align 4
 547   %call = call fast float @_Z3madfff(float 0.000000e+00, float %tmp, float %y)
 548   store float %call, ptr addrspace(1) %a, align 4
 549   ret void
 550 }
 551
 552 declare float @_Z3madfff(float, float, float)
 553
 554 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_mad_x0
 555 ; GCN: store float %y,
 556 define amdgpu_kernel void @test_mad_x0(ptr addrspace(1) nocapture %a, float %y) { ; mad(x, 0.0, y); the check expects the addend %y to be stored directly
 557 entry:
 558   %tmp = load float, ptr addrspace(1) %a, align 4
 559   %call = call fast float @_Z3madfff(float %tmp, float 0.000000e+00, float %y)
 560   store float %call, ptr addrspace(1) %a, align 4
 561   ret void
 562 }
 563
 564 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_x1y
 565 ; GCN: %call = fadd fast float %tmp, %y
 566 define amdgpu_kernel void @test_fma_x1y(ptr addrspace(1) nocapture %a, float %y) { ; fma(x, 1.0, y); the check expects a plain fadd of x and y
 567 entry:
 568   %tmp = load float, ptr addrspace(1) %a, align 4
 569   %call = call fast float @_Z3fmafff(float %tmp, float 1.000000e+00, float %y)
 570   store float %call, ptr addrspace(1) %a, align 4
 571   ret void
 572 }
 573
 574 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_1xy
 575 ; GCN: %call = fadd fast float %tmp, %y
 576 define amdgpu_kernel void @test_fma_1xy(ptr addrspace(1) nocapture %a, float %y) { ; fma(1.0, x, y); the check expects a plain fadd of x and y
 577 entry:
 578   %tmp = load float, ptr addrspace(1) %a, align 4
 579   %call = call fast float @_Z3fmafff(float 1.000000e+00, float %tmp, float %y)
 580   store float %call, ptr addrspace(1) %a, align 4
 581   ret void
 582 }
 583
 584 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_xy0
 585 ; GCN: %call = fmul fast float %tmp1, %tmp
 586 define amdgpu_kernel void @test_fma_xy0(ptr addrspace(1) nocapture %a) { ; fma(x, y, 0.0); the check expects a plain fmul of the two loaded operands
 587 entry:
 588   %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
 589   %tmp = load float, ptr addrspace(1) %arrayidx, align 4 ; first factor comes from a[1]
 590   %tmp1 = load float, ptr addrspace(1) %a, align 4 ; second factor comes from a[0]
 591   %call = call fast float @_Z3fmafff(float %tmp, float %tmp1, float 0.000000e+00)
 592   store float %call, ptr addrspace(1) %a, align 4
 593   ret void
 594 }
 595
 596 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_exp(
 597 ; GCN-NATIVE: call fast float @llvm.exp.f32(float %tmp)
 598 define amdgpu_kernel void @test_use_native_exp(ptr addrspace(1) nocapture %a) { ; exp(x); the native check expects the llvm.exp.f32 intrinsic; the label ends in '(' so it cannot also match the _exp2/_exp10 kernels
 599 entry:
 600   %tmp = load float, ptr addrspace(1) %a, align 4
 601   %call = call fast float @_Z3expf(float %tmp)
 602   store float %call, ptr addrspace(1) %a, align 4
 603   ret void
 604 }
 605
 606 declare float @_Z3expf(float)
 607
 608 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_exp2
 609 ; GCN-NATIVE: call fast float @llvm.exp2.f32(float %tmp)
 610 define amdgpu_kernel void @test_use_native_exp2(ptr addrspace(1) nocapture %a) { ; exp2(x); the native check expects the llvm.exp2.f32 intrinsic
 611 entry:
 612   %tmp = load float, ptr addrspace(1) %a, align 4
 613   %call = call fast float @_Z4exp2f(float %tmp)
 614   store float %call, ptr addrspace(1) %a, align 4
 615   ret void
 616 }
 617
 618 declare float @_Z4exp2f(float)
 619
 620 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_exp10
 621 ; GCN-NATIVE: call fast float @_Z12native_exp10f(float %tmp)
 622 define amdgpu_kernel void @test_use_native_exp10(ptr addrspace(1) nocapture %a) { ; exp10(x); the native check expects a native_exp10 library call rather than an intrinsic
 623 entry:
 624   %tmp = load float, ptr addrspace(1) %a, align 4
 625   %call = call fast float @_Z5exp10f(float %tmp)
 626   store float %call, ptr addrspace(1) %a, align 4
 627   ret void
 628 }
 629
 630 declare float @_Z5exp10f(float)
 631
 632 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_log(
 633 ; GCN-NATIVE: call fast float @llvm.log.f32(float %tmp)
 634 define amdgpu_kernel void @test_use_native_log(ptr addrspace(1) nocapture %a) { ; log(x); the native check expects the llvm.log.f32 intrinsic; the label ends in '(' so it cannot also match the _log2/_log10 kernels
 635 entry:
 636   %tmp = load float, ptr addrspace(1) %a, align 4
 637   %call = call fast float @_Z3logf(float %tmp)
 638   store float %call, ptr addrspace(1) %a, align 4
 639   ret void
 640 }
 641
 642 declare float @_Z3logf(float)
 643
 644 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_log2
 645 ; GCN-NATIVE: call fast float @llvm.log2.f32(float %tmp)
 646 define amdgpu_kernel void @test_use_native_log2(ptr addrspace(1) nocapture %a) { ; log2(x); the native check expects the llvm.log2.f32 intrinsic
 647 entry:
 648   %tmp = load float, ptr addrspace(1) %a, align 4
 649   %call = call fast float @_Z4log2f(float %tmp)
 650   store float %call, ptr addrspace(1) %a, align 4
 651   ret void
 652 }
 653
 654 declare float @_Z4log2f(float)
 655
 656 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_log10
 657 ; GCN-NATIVE: call fast float @llvm.log10.f32(float %tmp)
 658 define amdgpu_kernel void @test_use_native_log10(ptr addrspace(1) nocapture %a) { ; log10(x); the native check expects the llvm.log10.f32 intrinsic
 659 entry:
 660   %tmp = load float, ptr addrspace(1) %a, align 4
 661   %call = call fast float @_Z5log10f(float %tmp)
 662   store float %call, ptr addrspace(1) %a, align 4
 663   ret void
 664 }
 665
 666 declare float @_Z5log10f(float)
 667
 668 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_powr(
 669 ; GCN: %tmp1 = load float, ptr addrspace(1) %arrayidx1, align 4
 670 ; GCN: %__log2 = tail call fast float @llvm.log2.f32(float %tmp)
 671 ; GCN: %__ylogx = fmul fast float %tmp1, %__log2
 672 ; GCN: %__exp2 = tail call fast float @llvm.exp2.f32(float %__ylogx)
 673 ; GCN: store float %__exp2, ptr addrspace(1) %a, align 4
 674 define amdgpu_kernel void @test_use_native_powr(ptr addrspace(1) nocapture %a) { ; powr(x, y); the checks apply to every RUN configuration and expect exp2(y * log2(x)); the label ends in '(' so it cannot also match the _nobuiltin kernel
 675 entry:
 676   %tmp = load float, ptr addrspace(1) %a, align 4 ; base x comes from a[0]
 677   %arrayidx1 = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
 678   %tmp1 = load float, ptr addrspace(1) %arrayidx1, align 4 ; exponent y comes from a[1]
 679   %call = call fast float @_Z4powrff(float %tmp, float %tmp1)
 680   store float %call, ptr addrspace(1) %a, align 4 ; result goes to a[0]
 681   ret void
 682 }
 683
 684 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_powr_nobuiltin
 685 ; GCN: %call = tail call fast float @_Z4powrff(float %tmp, float %tmp1)
 686 define amdgpu_kernel void @test_use_native_powr_nobuiltin(ptr addrspace(1) nocapture %a) { ; same kernel shape as the powr test, but the call site is marked nobuiltin; every run expects the library call to remain
 687 entry:
 688   %tmp = load float, ptr addrspace(1) %a, align 4 ; base operand x = a[0]
 689   %arrayidx1 = getelementptr inbounds float, ptr addrspace(1) %a, i64 1 ; address of a[1]
 690   %tmp1 = load float, ptr addrspace(1) %arrayidx1, align 4 ; exponent operand y = a[1]
 691   %call = call fast float @_Z4powrff(float %tmp, float %tmp1) nobuiltin ; nobuiltin on the call site is the only difference from the powr test above
 692   store float %call, ptr addrspace(1) %a, align 4 ; write the result to a[0]
 693   ret void
 694 }
 695
 696 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_sqrt
 697 ; GCN-NATIVE: call fast float @_Z11native_sqrtf(float %tmp)
 698 define amdgpu_kernel void @test_use_native_sqrt(ptr addrspace(1) nocapture %a) { ; kernel under test: a[0] = sqrt(a[0]); the -amdgpu-use-native runs expect a call to _Z11native_sqrtf (native_sqrt(float)) instead
 699 entry:
 700   %tmp = load float, ptr addrspace(1) %a, align 4 ; read the input operand from a[0]
 701   %call = call fast float @_Z4sqrtf(float %tmp) ; fast-math call to _Z4sqrtf (Itanium mangling of sqrt(float)); declared further down in the file
 702   store float %call, ptr addrspace(1) %a, align 4 ; write the result back over the input
 703   ret void
 704 }
 705
 706 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_dont_use_native_sqrt_fast_f64
 707 ; GCN: call fast double @_Z4sqrtd(double %tmp)
 708 define amdgpu_kernel void @test_dont_use_native_sqrt_fast_f64(ptr addrspace(1) nocapture %a) { ; double-precision counterpart of the sqrt test: every run expects the _Z4sqrtd call to stay, i.e. no native_ replacement
 709 entry:
 710   %tmp = load double, ptr addrspace(1) %a, align 8 ; read the double input operand from a[0]
 711   %call = call fast double @_Z4sqrtd(double %tmp) ; fast-math call to _Z4sqrtd (Itanium mangling of sqrt(double))
 712   store double %call, ptr addrspace(1) %a, align 8 ; write the result back over the input
 713   ret void
 714 }
 715
 716 declare float @_Z4sqrtf(float) ; external declaration used by @test_use_native_sqrt
 717 declare double @_Z4sqrtd(double) ; external declaration used by @test_dont_use_native_sqrt_fast_f64
 718
 719 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_rsqrt
 720 ; GCN-NATIVE: call fast float @_Z12native_rsqrtf(float %tmp)
 721 define amdgpu_kernel void @test_use_native_rsqrt(ptr addrspace(1) nocapture %a) { ; kernel under test: a[0] = rsqrt(a[0]); the -amdgpu-use-native runs expect a call to _Z12native_rsqrtf (native_rsqrt(float)) instead
 722 entry:
 723   %tmp = load float, ptr addrspace(1) %a, align 4 ; read the input operand from a[0]
 724   %call = call fast float @_Z5rsqrtf(float %tmp) ; fast-math call to _Z5rsqrtf (Itanium mangling of rsqrt(float))
 725   store float %call, ptr addrspace(1) %a, align 4 ; write the result back over the input
 726   ret void
 727 }
 728
 729 declare float @_Z5rsqrtf(float) ; external declaration only, no body in this file section
 730
 731 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_tan
 732 ; GCN-NATIVE: call fast float @_Z10native_tanf(float %tmp)
 733 define amdgpu_kernel void @test_use_native_tan(ptr addrspace(1) nocapture %a) { ; kernel under test: a[0] = tan(a[0]); the -amdgpu-use-native runs expect a call to _Z10native_tanf (native_tan(float)) instead
 734 entry:
 735   %tmp = load float, ptr addrspace(1) %a, align 4 ; read the input operand from a[0]
 736   %call = call fast float @_Z3tanf(float %tmp) ; fast-math call to _Z3tanf (Itanium mangling of tan(float))
 737   store float %call, ptr addrspace(1) %a, align 4 ; write the result back over the input
 738   ret void
 739 }
 740
 741 declare float @_Z3tanf(float) ; external declaration only, no body in this file section
 742
 743 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_sincos
 744 ; GCN-NATIVE: call float @_Z10native_sinf(float %tmp)
 745 ; GCN-NATIVE: call float @_Z10native_cosf(float %tmp)
 746 define amdgpu_kernel void @test_use_native_sincos(ptr addrspace(1) %a) { ; kernel under test: one sincos call on a[0] with the second result stored through a pointer to a[1]; the -amdgpu-use-native runs expect separate native_sin and native_cos calls, written above without the fast flag
 747 entry:
 748   %tmp = load float, ptr addrspace(1) %a, align 4 ; read the angle operand from a[0]
 749   %arrayidx1 = getelementptr inbounds float, ptr addrspace(1) %a, i64 1 ; address of a[1], destination for the by-pointer result
 750   %tmp1 = addrspacecast ptr addrspace(1) %arrayidx1 to ptr ; cast to a default-address-space pointer, matching the ptr parameter of the callee
 751   %call = call fast float @_Z6sincosfPf(float %tmp, ptr %tmp1) ; fast-math call to _Z6sincosfPf (Itanium mangling of sincos(float, float*))
 752   store float %call, ptr addrspace(1) %a, align 4 ; write the returned value to a[0]
 753   ret void
 754 }
 755
 756 declare float @_Z6sincosfPf(float, ptr) ; external declaration only, no body in this file section
 757
 758 %opencl.pipe_t = type opaque ; opaque named type; no function visible in this section refers to it (all pipe arguments below are plain ptr values)
 759 %opencl.reserve_id_t = type opaque ; opaque named type; likewise not referenced by the visible functions
 760
 761 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_read_pipe(ptr addrspace(1) %p, ptr addrspace(1) %ptr)
 762 ; GCN-PRELINK: call i32 @__read_pipe_2_4(ptr addrspace(1) %{{.*}}, ptr %{{.*}}) #[[$NOUNWIND:[0-9]+]]
 763 ; GCN-PRELINK: call i32 @__read_pipe_4_4(ptr addrspace(1) %{{.*}}, ptr addrspace(5) %{{.*}}, i32 2, ptr %{{.*}}) #[[$NOUNWIND]]
 764 define amdgpu_kernel void @test_read_pipe(ptr addrspace(1) %p, ptr addrspace(1) %ptr) local_unnamed_addr { ; kernel under test: a plain and a reserved pipe read, both passing i32 4, i32 4 as trailing arguments; the prelink simplify-libcall runs expect the _4-suffixed callees with those two arguments dropped
 765 entry:
 766   %tmp1 = addrspacecast ptr addrspace(1) %ptr to ptr ; destination buffer as a default-address-space pointer
 767   %tmp2 = call i32 @__read_pipe_2(ptr addrspace(1) %p, ptr %tmp1, i32 4, i32 4) #0 ; plain read; expected to become __read_pipe_2_4(%p, %tmp1)
 768   %tmp3 = call ptr addrspace(5) @__reserve_read_pipe(ptr addrspace(1) %p, i32 2, i32 4, i32 4) ; reservation whose result feeds the reserved read; no check line covers this call
 769   %tmp4 = call i32 @__read_pipe_4(ptr addrspace(1) %p, ptr addrspace(5) %tmp3, i32 2, ptr %tmp1, i32 4, i32 4) #0 ; reserved read; expected to become __read_pipe_4_4 with the first four arguments kept
 770   call void @__commit_read_pipe(ptr addrspace(1) %p, ptr addrspace(5) %tmp3, i32 4, i32 4) ; commit of the reservation; no check line covers this call
 771   ret void
 772 }
 773
 774 declare i32 @__read_pipe_2(ptr addrspace(1), ptr, i32, i32) ; external declaration, also used by @test_pipe_size
 775
 776 declare ptr addrspace(5) @__reserve_read_pipe(ptr addrspace(1), i32, i32, i32) ; external declaration; returns the addrspace(5) pointer passed on as the reservation
 777
 778 declare i32 @__read_pipe_4(ptr addrspace(1), ptr addrspace(5), i32, ptr, i32, i32) ; external declaration of the reserved-read entry point
 779
 780 declare void @__commit_read_pipe(ptr addrspace(1), ptr addrspace(5), i32, i32) ; external declaration of the commit entry point
 781
 782 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_write_pipe(ptr addrspace(1) %p, ptr addrspace(1) %ptr)
 783 ; GCN-PRELINK: call i32 @__write_pipe_2_4(ptr addrspace(1) %{{.*}}, ptr %{{.*}}) #[[$NOUNWIND]]
 784 ; GCN-PRELINK: call i32 @__write_pipe_4_4(ptr addrspace(1) %{{.*}}, ptr addrspace(5) %{{.*}}, i32 2, ptr %{{.*}}) #[[$NOUNWIND]]
 785 define amdgpu_kernel void @test_write_pipe(ptr addrspace(1) %p, ptr addrspace(1) %ptr) local_unnamed_addr { ; write-side mirror of @test_read_pipe: the prelink simplify-libcall runs expect the _4-suffixed callees with the trailing i32 4, i32 4 arguments dropped
 786 entry:
 787   %tmp1 = addrspacecast ptr addrspace(1) %ptr to ptr ; source buffer as a default-address-space pointer
 788   %tmp2 = call i32 @__write_pipe_2(ptr addrspace(1) %p, ptr %tmp1, i32 4, i32 4) #0 ; plain write; expected to become __write_pipe_2_4(%p, %tmp1)
 789   %tmp3 = call ptr addrspace(5) @__reserve_write_pipe(ptr addrspace(1) %p, i32 2, i32 4, i32 4) #0 ; reservation whose result feeds the reserved write; no check line covers this call
 790   %tmp4 = call i32 @__write_pipe_4(ptr addrspace(1) %p, ptr addrspace(5) %tmp3, i32 2, ptr %tmp1, i32 4, i32 4) #0 ; reserved write; expected to become __write_pipe_4_4 with the first four arguments kept
 791   call void @__commit_write_pipe(ptr addrspace(1) %p, ptr addrspace(5) %tmp3, i32 4, i32 4) #0 ; commit of the reservation; no check line covers this call
 792   ret void
 793 }
 794
 795 declare i32 @__write_pipe_2(ptr addrspace(1), ptr, i32, i32) local_unnamed_addr ; external declaration of the plain write entry point
 796
 797 declare ptr addrspace(5) @__reserve_write_pipe(ptr addrspace(1), i32, i32, i32) local_unnamed_addr ; external declaration; returns the addrspace(5) pointer passed on as the reservation
 798
 799 declare i32 @__write_pipe_4(ptr addrspace(1), ptr addrspace(5), i32, ptr, i32, i32) local_unnamed_addr ; external declaration of the reserved-write entry point
 800
 801 declare void @__commit_write_pipe(ptr addrspace(1), ptr addrspace(5), i32, i32) local_unnamed_addr ; external declaration of the commit entry point
 802
 803 %struct.S = type { [100 x i32] } ; 100 x i32 = 400 bytes, equal to the i32 400 size passed by the last read below (presumably the packet type that read models; the type itself is not referenced in this section)
 804 ; Reads whose two trailing i32 arguments are equal powers of two from 1 to 128 are expected, in the prelink simplify-libcall runs, to call __read_pipe_2_<size> with only the two pointer arguments; the 400/4 read must keep the generic four-argument call. Each pattern below spells out the full argument list so it cannot match through a wildcard alone.
 805 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pipe_size
 806 ; GCN-PRELINK: call i32 @__read_pipe_2_1(ptr addrspace(1) %{{.*}}, ptr %{{.*}}) #[[$NOUNWIND]]
 807 ; GCN-PRELINK: call i32 @__read_pipe_2_2(ptr addrspace(1) %{{.*}}, ptr %{{.*}}) #[[$NOUNWIND]]
 808 ; GCN-PRELINK: call i32 @__read_pipe_2_4(ptr addrspace(1) %{{.*}}, ptr %{{.*}}) #[[$NOUNWIND]]
 809 ; GCN-PRELINK: call i32 @__read_pipe_2_8(ptr addrspace(1) %{{.*}}, ptr %{{.*}}) #[[$NOUNWIND]]
 810 ; GCN-PRELINK: call i32 @__read_pipe_2_16(ptr addrspace(1) %{{.*}}, ptr %{{.*}}) #[[$NOUNWIND]]
 811 ; GCN-PRELINK: call i32 @__read_pipe_2_32(ptr addrspace(1) %{{.*}}, ptr %{{.*}}) #[[$NOUNWIND]]
 812 ; GCN-PRELINK: call i32 @__read_pipe_2_64(ptr addrspace(1) %{{.*}}, ptr %{{.*}}) #[[$NOUNWIND]]
 813 ; GCN-PRELINK: call i32 @__read_pipe_2_128(ptr addrspace(1) %{{.*}}, ptr %{{.*}}) #[[$NOUNWIND]]
 814 ; GCN-PRELINK: call i32 @__read_pipe_2(ptr addrspace(1) %{{.*}}, ptr %{{.*}}, i32 400, i32 4) #[[$NOUNWIND]]
 815 define amdgpu_kernel void @test_pipe_size(ptr addrspace(1) %p1, ptr addrspace(1) %ptr1, ptr addrspace(1) %p2, ptr addrspace(1) %ptr2, ptr addrspace(1) %p4, ptr addrspace(1) %ptr4, ptr addrspace(1) %p8, ptr addrspace(1) %ptr8, ptr addrspace(1) %p16, ptr addrspace(1) %ptr16, ptr addrspace(1) %p32, ptr addrspace(1) %ptr32, ptr addrspace(1) %p64, ptr addrspace(1) %ptr64, ptr addrspace(1) %p128, ptr addrspace(1) %ptr128, ptr addrspace(1) %pu, ptr addrspace(1) %ptru) local_unnamed_addr #0 {
 816 entry:
 817   %tmp = addrspacecast ptr addrspace(1) %ptr1 to ptr ; each read gets its own pipe/buffer argument pair; the buffer is cast to a default-address-space pointer first
 818   %tmp1 = call i32 @__read_pipe_2(ptr addrspace(1) %p1, ptr %tmp, i32 1, i32 1) #0 ; trailing arguments 1, 1
 819   %tmp3 = addrspacecast ptr addrspace(1) %ptr2 to ptr
 820   %tmp4 = call i32 @__read_pipe_2(ptr addrspace(1) %p2, ptr %tmp3, i32 2, i32 2) #0 ; trailing arguments 2, 2
 821   %tmp6 = addrspacecast ptr addrspace(1) %ptr4 to ptr
 822   %tmp7 = call i32 @__read_pipe_2(ptr addrspace(1) %p4, ptr %tmp6, i32 4, i32 4) #0 ; trailing arguments 4, 4
 823   %tmp9 = addrspacecast ptr addrspace(1) %ptr8 to ptr
 824   %tmp10 = call i32 @__read_pipe_2(ptr addrspace(1) %p8, ptr %tmp9, i32 8, i32 8) #0 ; trailing arguments 8, 8
 825   %tmp12 = addrspacecast ptr addrspace(1) %ptr16 to ptr
 826   %tmp13 = call i32 @__read_pipe_2(ptr addrspace(1) %p16, ptr %tmp12, i32 16, i32 16) #0 ; trailing arguments 16, 16
 827   %tmp15 = addrspacecast ptr addrspace(1) %ptr32 to ptr
 828   %tmp16 = call i32 @__read_pipe_2(ptr addrspace(1) %p32, ptr %tmp15, i32 32, i32 32) #0 ; trailing arguments 32, 32
 829   %tmp18 = addrspacecast ptr addrspace(1) %ptr64 to ptr
 830   %tmp19 = call i32 @__read_pipe_2(ptr addrspace(1) %p64, ptr %tmp18, i32 64, i32 64) #0 ; trailing arguments 64, 64
 831   %tmp21 = addrspacecast ptr addrspace(1) %ptr128 to ptr
 832   %tmp22 = call i32 @__read_pipe_2(ptr addrspace(1) %p128, ptr %tmp21, i32 128, i32 128) #0 ; trailing arguments 128, 128 (largest specialised case checked here)
 833   %tmp24 = addrspacecast ptr addrspace(1) %ptru to ptr
 834   %tmp25 = call i32 @__read_pipe_2(ptr addrspace(1) %pu, ptr %tmp24, i32 400, i32 4) #0 ; trailing arguments 400, 4: differ from each other, expected to stay a generic __read_pipe_2 call
 835   ret void
 836 }
 837
 838 ; GCN-PRELINK: declare float @_Z4cbrtf(float) local_unnamed_addr #[[$NOUNWIND_READONLY:[0-9]+]]
 839 ; GCN-PRELINK: declare float @_Z11native_sqrtf(float) local_unnamed_addr #[[$NOUNWIND_READONLY]]
 840
 841 ; GCN-PRELINK-DAG: attributes #[[$NOUNWIND]] = { nounwind }
 842 ; GCN-PRELINK-DAG: attributes #[[$NOUNWIND_READONLY]] = { nofree nounwind memory(read) }
 843 attributes #0 = { nounwind } ; attribute group referenced as #0 by the pipe call sites and by the @test_pipe_size definition above