test/CodeGen/AMDGPU/simplify-libcalls.ll

   1 ; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-simplify-libcall < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-POSTLINK %s
   2 ; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-simplify-libcall -amdgpu-prelink  <%s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-PRELINK %s
   3 ; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-use-native -amdgpu-prelink < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-NATIVE %s
   4
   5 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos(
   6 ; GCN-POSTLINK: tail call fast float @_Z3sinf(
   7 ; GCN-POSTLINK: tail call fast float @_Z3cosf(
   8 ; GCN-PRELINK: call fast float @_Z6sincosfPf(
   9 ; GCN-NATIVE: tail call fast float @_Z10native_sinf(
  10 ; GCN-NATIVE: tail call fast float @_Z10native_cosf(
     ; Scalar float case: sin and cos are called on the same loaded value and the
     ; results are stored to a[0] and a[1].
     ; Expected output: both calls are kept without -amdgpu-prelink, a single
     ; sincos call appears with it, and native_sin/native_cos calls appear with
     ; -amdgpu-use-native.
     ; The label pattern above ends in '(' so that it matches only this function
     ; and not the @test_sincos_v* functions that share the same name prefix.
  11 define amdgpu_kernel void @test_sincos(float addrspace(1)* nocapture %a) {
  12 entry:
  13   %tmp = load float, float addrspace(1)* %a, align 4
  14   %call = tail call fast float @_Z3sinf(float %tmp)
  15   store float %call, float addrspace(1)* %a, align 4
  16   %call2 = tail call fast float @_Z3cosf(float %tmp)
  17   %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
  18   store float %call2, float addrspace(1)* %arrayidx3, align 4
  19   ret void
  20 }
  21
  22 declare float @_Z3sinf(float)
  23
  24 declare float @_Z3cosf(float)
  25
  26 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v2
  27 ; GCN-POSTLINK: tail call fast <2 x float> @_Z3sinDv2_f(
  28 ; GCN-POSTLINK: tail call fast <2 x float> @_Z3cosDv2_f(
  29 ; GCN-PRELINK: call fast <2 x float> @_Z6sincosDv2_fPS_(
  30 ; GCN-NATIVE: tail call fast <2 x float> @_Z10native_sinDv2_f(
  31 ; GCN-NATIVE: tail call fast <2 x float> @_Z10native_cosDv2_f(
     ; <2 x float> variant of the sin/cos pair on one loaded value.
     ; Expected output: separate sin and cos calls without -amdgpu-prelink, one
     ; sincos call with it, native_sin/native_cos with -amdgpu-use-native.
  32 define amdgpu_kernel void @test_sincos_v2(<2 x float> addrspace(1)* nocapture %a) {
  33 entry:
  34   %tmp = load <2 x float>, <2 x float> addrspace(1)* %a, align 8
  35   %call = tail call fast <2 x float> @_Z3sinDv2_f(<2 x float> %tmp)
  36   store <2 x float> %call, <2 x float> addrspace(1)* %a, align 8
  37   %call2 = tail call fast <2 x float> @_Z3cosDv2_f(<2 x float> %tmp)
  38   %arrayidx3 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i64 1
  39   store <2 x float> %call2, <2 x float> addrspace(1)* %arrayidx3, align 8
  40   ret void
  41 }
  42
  43 declare <2 x float> @_Z3sinDv2_f(<2 x float>)
  44
  45 declare <2 x float> @_Z3cosDv2_f(<2 x float>)
  46
  47 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v3
  48 ; GCN-POSTLINK: tail call fast <3 x float> @_Z3sinDv3_f(
  49 ; GCN-POSTLINK: tail call fast <3 x float> @_Z3cosDv3_f(
  50 ; GCN-PRELINK: call fast <3 x float> @_Z6sincosDv3_fPS_(
  51 ; GCN-NATIVE: tail call fast <3 x float> @_Z10native_sinDv3_f(
  52 ; GCN-NATIVE: tail call fast <3 x float> @_Z10native_cosDv3_f(
     ; <3 x float> variant. The memory accesses are done as <4 x float>: the
     ; pointer is bitcast, the first three lanes are extracted with a
     ; shufflevector for the calls, and each result is widened back to
     ; <4 x float> (last lane undef) before being stored.
     ; Expected output: separate sin and cos calls without -amdgpu-prelink, one
     ; sincos call with it, native_sin/native_cos with -amdgpu-use-native.
  53 define amdgpu_kernel void @test_sincos_v3(<3 x float> addrspace(1)* nocapture %a) {
  54 entry:
  55   %castToVec4 = bitcast <3 x float> addrspace(1)* %a to <4 x float> addrspace(1)*
  56   %loadVec4 = load <4 x float>, <4 x float> addrspace(1)* %castToVec4, align 16
  57   %extractVec4 = shufflevector <4 x float> %loadVec4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  58   %call = tail call fast <3 x float> @_Z3sinDv3_f(<3 x float> %extractVec4)
  59   %extractVec6 = shufflevector <3 x float> %call, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  60   store <4 x float> %extractVec6, <4 x float> addrspace(1)* %castToVec4, align 16
  61   %call11 = tail call fast <3 x float> @_Z3cosDv3_f(<3 x float> %extractVec4)
  62   %arrayidx12 = getelementptr inbounds <3 x float>, <3 x float> addrspace(1)* %a, i64 1
  63   %extractVec13 = shufflevector <3 x float> %call11, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  64   %storetmp14 = bitcast <3 x float> addrspace(1)* %arrayidx12 to <4 x float> addrspace(1)*
  65   store <4 x float> %extractVec13, <4 x float> addrspace(1)* %storetmp14, align 16
  66   ret void
  67 }
  68
  69 declare <3 x float> @_Z3sinDv3_f(<3 x float>)
  70
  71 declare <3 x float> @_Z3cosDv3_f(<3 x float>)
  72
  73 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v4
  74 ; GCN-POSTLINK: tail call fast <4 x float> @_Z3sinDv4_f(
  75 ; GCN-POSTLINK: tail call fast <4 x float> @_Z3cosDv4_f(
  76 ; GCN-PRELINK: call fast <4 x float> @_Z6sincosDv4_fPS_(
  77 ; GCN-NATIVE: tail call fast <4 x float> @_Z10native_sinDv4_f(
  78 ; GCN-NATIVE: tail call fast <4 x float> @_Z10native_cosDv4_f(
     ; <4 x float> variant of the sin/cos pair on one loaded value.
     ; Expected output: separate sin and cos calls without -amdgpu-prelink, one
     ; sincos call with it, native_sin/native_cos with -amdgpu-use-native.
  79 define amdgpu_kernel void @test_sincos_v4(<4 x float> addrspace(1)* nocapture %a) {
  80 entry:
  81   %tmp = load <4 x float>, <4 x float> addrspace(1)* %a, align 16
  82   %call = tail call fast <4 x float> @_Z3sinDv4_f(<4 x float> %tmp)
  83   store <4 x float> %call, <4 x float> addrspace(1)* %a, align 16
  84   %call2 = tail call fast <4 x float> @_Z3cosDv4_f(<4 x float> %tmp)
  85   %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a, i64 1
  86   store <4 x float> %call2, <4 x float> addrspace(1)* %arrayidx3, align 16
  87   ret void
  88 }
  89
  90 declare <4 x float> @_Z3sinDv4_f(<4 x float>)
  91
  92 declare <4 x float> @_Z3cosDv4_f(<4 x float>)
  93
  94 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v8
  95 ; GCN-POSTLINK: tail call fast <8 x float> @_Z3sinDv8_f(
  96 ; GCN-POSTLINK: tail call fast <8 x float> @_Z3cosDv8_f(
  97 ; GCN-PRELINK: call fast <8 x float> @_Z6sincosDv8_fPS_(
  98 ; GCN-NATIVE: tail call fast <8 x float> @_Z10native_sinDv8_f(
  99 ; GCN-NATIVE: tail call fast <8 x float> @_Z10native_cosDv8_f(
     ; <8 x float> variant of the sin/cos pair on one loaded value.
     ; Expected output: separate sin and cos calls without -amdgpu-prelink, one
     ; sincos call with it, native_sin/native_cos with -amdgpu-use-native.
 100 define amdgpu_kernel void @test_sincos_v8(<8 x float> addrspace(1)* nocapture %a) {
 101 entry:
 102   %tmp = load <8 x float>, <8 x float> addrspace(1)* %a, align 32
 103   %call = tail call fast <8 x float> @_Z3sinDv8_f(<8 x float> %tmp)
 104   store <8 x float> %call, <8 x float> addrspace(1)* %a, align 32
 105   %call2 = tail call fast <8 x float> @_Z3cosDv8_f(<8 x float> %tmp)
 106   %arrayidx3 = getelementptr inbounds <8 x float>, <8 x float> addrspace(1)* %a, i64 1
 107   store <8 x float> %call2, <8 x float> addrspace(1)* %arrayidx3, align 32
 108   ret void
 109 }
 110
 111 declare <8 x float> @_Z3sinDv8_f(<8 x float>)
 112
 113 declare <8 x float> @_Z3cosDv8_f(<8 x float>)
 114
 115 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v16
 116 ; GCN-POSTLINK: tail call fast <16 x float> @_Z3sinDv16_f(
 117 ; GCN-POSTLINK: tail call fast <16 x float> @_Z3cosDv16_f(
 118 ; GCN-PRELINK: call fast <16 x float> @_Z6sincosDv16_fPS_(
 119 ; GCN-NATIVE: tail call fast <16 x float> @_Z10native_sinDv16_f(
 120 ; GCN-NATIVE: tail call fast <16 x float> @_Z10native_cosDv16_f(
     ; <16 x float> variant of the sin/cos pair on one loaded value.
     ; Expected output: separate sin and cos calls without -amdgpu-prelink, one
     ; sincos call with it, native_sin/native_cos with -amdgpu-use-native.
 121 define amdgpu_kernel void @test_sincos_v16(<16 x float> addrspace(1)* nocapture %a) {
 122 entry:
 123   %tmp = load <16 x float>, <16 x float> addrspace(1)* %a, align 64
 124   %call = tail call fast <16 x float> @_Z3sinDv16_f(<16 x float> %tmp)
 125   store <16 x float> %call, <16 x float> addrspace(1)* %a, align 64
 126   %call2 = tail call fast <16 x float> @_Z3cosDv16_f(<16 x float> %tmp)
 127   %arrayidx3 = getelementptr inbounds <16 x float>, <16 x float> addrspace(1)* %a, i64 1
 128   store <16 x float> %call2, <16 x float> addrspace(1)* %arrayidx3, align 64
 129   ret void
 130 }
 131
 132 declare <16 x float> @_Z3sinDv16_f(<16 x float>)
 133
 134 declare <16 x float> @_Z3cosDv16_f(<16 x float>)
 135
 136 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_native_recip
 137 ; GCN: store float 0x3FD5555560000000, float addrspace(1)* %a
     ; native_recip on the constant 3.0. All three runs expect the call to be
     ; replaced by a store of the constant 0x3FD5555560000000 (the float value
     ; closest to 1/3, printed in LLVM's hexadecimal form).
 138 define amdgpu_kernel void @test_native_recip(float addrspace(1)* nocapture %a) {
 139 entry:
 140   %call = tail call fast float @_Z12native_recipf(float 3.000000e+00)
 141   store float %call, float addrspace(1)* %a, align 4
 142   ret void
 143 }
 144
 145 declare float @_Z12native_recipf(float)
 146
 147 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_half_recip
 148 ; GCN: store float 0x3FD5555560000000, float addrspace(1)* %a
     ; half_recip on the constant 3.0. All three runs expect the call to be
     ; replaced by a store of the constant 0x3FD5555560000000 (the float value
     ; closest to 1/3).
 149 define amdgpu_kernel void @test_half_recip(float addrspace(1)* nocapture %a) {
 150 entry:
 151   %call = tail call fast float @_Z10half_recipf(float 3.000000e+00)
 152   store float %call, float addrspace(1)* %a, align 4
 153   ret void
 154 }
 155
 156 declare float @_Z10half_recipf(float)
 157
 158 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_native_divide
 159 ; GCN: fmul fast float %tmp, 0x3FD5555560000000
     ; native_divide of a loaded value by the constant 3.0. All three runs
     ; expect a fast fmul of the loaded value by the float closest to 1/3.
 160 define amdgpu_kernel void @test_native_divide(float addrspace(1)* nocapture %a) {
 161 entry:
 162   %tmp = load float, float addrspace(1)* %a, align 4
 163   %call = tail call fast float @_Z13native_divideff(float %tmp, float 3.000000e+00)
 164   store float %call, float addrspace(1)* %a, align 4
 165   ret void
 166 }
 167
 168 declare float @_Z13native_divideff(float, float)
 169
 170 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_half_divide
 171 ; GCN: fmul fast float %tmp, 0x3FD5555560000000
     ; half_divide of a loaded value by the constant 3.0. All three runs
     ; expect a fast fmul of the loaded value by the float closest to 1/3.
 172 define amdgpu_kernel void @test_half_divide(float addrspace(1)* nocapture %a) {
 173 entry:
 174   %tmp = load float, float addrspace(1)* %a, align 4
 175   %call = tail call fast float @_Z11half_divideff(float %tmp, float 3.000000e+00)
 176   store float %call, float addrspace(1)* %a, align 4
 177   ret void
 178 }
 179
 180 declare float @_Z11half_divideff(float, float)
 181
 182 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_0f
 183 ; GCN: store float 1.000000e+00, float addrspace(1)* %a
     ; pow(x, 0.0) on a loaded x. All three runs expect the constant 1.0 to be
     ; stored in place of the call result.
 184 define amdgpu_kernel void @test_pow_0f(float addrspace(1)* nocapture %a) {
 185 entry:
 186   %tmp = load float, float addrspace(1)* %a, align 4
 187   %call = tail call fast float @_Z3powff(float %tmp, float 0.000000e+00)
 188   store float %call, float addrspace(1)* %a, align 4
 189   ret void
 190 }
 191
 192 declare float @_Z3powff(float, float)
 193
 194 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_0i
 195 ; GCN: store float 1.000000e+00, float addrspace(1)* %a
     ; pow(x, 0.0) on a loaded x; all three runs expect a store of 1.0.
     ; NOTE(review): the body is identical to @test_pow_0f (the exponent is the
     ; float constant 0.0). The 'i' suffix presumably refers to an integer
     ; literal in the source this IR was generated from - confirm.
 196 define amdgpu_kernel void @test_pow_0i(float addrspace(1)* nocapture %a) {
 197 entry:
 198   %tmp = load float, float addrspace(1)* %a, align 4
 199   %call = tail call fast float @_Z3powff(float %tmp, float 0.000000e+00)
 200   store float %call, float addrspace(1)* %a, align 4
 201   ret void
 202 }
 203
 204 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_1f
 205 ; GCN: %tmp = load float, float addrspace(1)* %arrayidx, align 4
 206 ; GCN: store float %tmp, float addrspace(1)* %a, align 4
     ; pow(x, 1.0) with x loaded from a[1]. All three runs expect the loaded
     ; value itself to be stored to a[0], i.e. the call result is replaced by x.
 207 define amdgpu_kernel void @test_pow_1f(float addrspace(1)* nocapture %a) {
 208 entry:
 209   %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
 210   %tmp = load float, float addrspace(1)* %arrayidx, align 4
 211   %call = tail call fast float @_Z3powff(float %tmp, float 1.000000e+00)
 212   store float %call, float addrspace(1)* %a, align 4
 213   ret void
 214 }
 215
 216 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_1i
 217 ; GCN: %tmp = load float, float addrspace(1)* %arrayidx, align 4
 218 ; GCN: store float %tmp, float addrspace(1)* %a, align 4
     ; pow(x, 1.0) with x loaded from a[1]; all three runs expect x to be stored
     ; to a[0] directly.
     ; NOTE(review): the body is identical to @test_pow_1f (float exponent 1.0).
     ; The 'i' suffix presumably refers to an integer literal in the original
     ; source - confirm.
 219 define amdgpu_kernel void @test_pow_1i(float addrspace(1)* nocapture %a) {
 220 entry:
 221   %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
 222   %tmp = load float, float addrspace(1)* %arrayidx, align 4
 223   %call = tail call fast float @_Z3powff(float %tmp, float 1.000000e+00)
 224   store float %call, float addrspace(1)* %a, align 4
 225   ret void
 226 }
 227
 228 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_2f
 229 ; GCN: %tmp = load float, float addrspace(1)* %a, align 4
 230 ; GCN: %__pow2 = fmul fast float %tmp, %tmp
     ; pow(x, 2.0) on a loaded x. All three runs expect a single fast fmul of x
     ; by itself, named %__pow2.
 231 define amdgpu_kernel void @test_pow_2f(float addrspace(1)* nocapture %a) {
 232 entry:
 233   %tmp = load float, float addrspace(1)* %a, align 4
 234   %call = tail call fast float @_Z3powff(float %tmp, float 2.000000e+00)
 235   store float %call, float addrspace(1)* %a, align 4
 236   ret void
 237 }
 238
 239 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_2i
 240 ; GCN: %tmp = load float, float addrspace(1)* %a, align 4
 241 ; GCN: %__pow2 = fmul fast float %tmp, %tmp
     ; pow(x, 2.0) on a loaded x; all three runs expect a fast fmul of x by
     ; itself, named %__pow2.
     ; NOTE(review): the body is identical to @test_pow_2f (float exponent 2.0).
     ; The 'i' suffix presumably refers to an integer literal in the original
     ; source - confirm.
 242 define amdgpu_kernel void @test_pow_2i(float addrspace(1)* nocapture %a) {
 243 entry:
 244   %tmp = load float, float addrspace(1)* %a, align 4
 245   %call = tail call fast float @_Z3powff(float %tmp, float 2.000000e+00)
 246   store float %call, float addrspace(1)* %a, align 4
 247   ret void
 248 }
 249
 250 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_m1f
 251 ; GCN: %tmp = load float, float addrspace(1)* %arrayidx, align 4
 252 ; GCN: %__powrecip = fdiv fast float 1.000000e+00, %tmp
     ; pow(x, -1.0) with x loaded from a[1]. All three runs expect a fast fdiv
     ; of 1.0 by x, named %__powrecip.
 253 define amdgpu_kernel void @test_pow_m1f(float addrspace(1)* nocapture %a) {
 254 entry:
 255   %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
 256   %tmp = load float, float addrspace(1)* %arrayidx, align 4
 257   %call = tail call fast float @_Z3powff(float %tmp, float -1.000000e+00)
 258   store float %call, float addrspace(1)* %a, align 4
 259   ret void
 260 }
 261
 262 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_m1i
 263 ; GCN: %tmp = load float, float addrspace(1)* %arrayidx, align 4
 264 ; GCN: %__powrecip = fdiv fast float 1.000000e+00, %tmp
     ; pow(x, -1.0) with x loaded from a[1]; all three runs expect a fast fdiv
     ; of 1.0 by x, named %__powrecip.
     ; NOTE(review): the body is identical to @test_pow_m1f (float exponent
     ; -1.0). The 'i' suffix presumably refers to an integer literal in the
     ; original source - confirm.
 265 define amdgpu_kernel void @test_pow_m1i(float addrspace(1)* nocapture %a) {
 266 entry:
 267   %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
 268   %tmp = load float, float addrspace(1)* %arrayidx, align 4
 269   %call = tail call fast float @_Z3powff(float %tmp, float -1.000000e+00)
 270   store float %call, float addrspace(1)* %a, align 4
 271   ret void
 272 }
 273
 274 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_half
 275 ; GCN-POSTLINK: tail call fast float @_Z3powff(float %tmp, float 5.000000e-01)
 276 ; GCN-PRELINK: %__pow2sqrt = tail call fast float @_Z4sqrtf(float %tmp)
     ; pow(x, 0.5) with x loaded from a[1]. Without -amdgpu-prelink the pow call
     ; is expected to stay unchanged; with it a sqrt call named %__pow2sqrt is
     ; expected instead. The native run only checks the label here.
 277 define amdgpu_kernel void @test_pow_half(float addrspace(1)* nocapture %a) {
 278 entry:
 279   %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
 280   %tmp = load float, float addrspace(1)* %arrayidx, align 4
 281   %call = tail call fast float @_Z3powff(float %tmp, float 5.000000e-01)
 282   store float %call, float addrspace(1)* %a, align 4
 283   ret void
 284 }
 285
 286 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_mhalf
 287 ; GCN-POSTLINK: tail call fast float @_Z3powff(float %tmp, float -5.000000e-01)
 288 ; GCN-PRELINK: %__pow2rsqrt = tail call fast float @_Z5rsqrtf(float %tmp)
     ; pow(x, -0.5) with x loaded from a[1]. Without -amdgpu-prelink the pow
     ; call is expected to stay unchanged; with it an rsqrt call named
     ; %__pow2rsqrt is expected instead. The native run only checks the label.
 289 define amdgpu_kernel void @test_pow_mhalf(float addrspace(1)* nocapture %a) {
 290 entry:
 291   %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
 292   %tmp = load float, float addrspace(1)* %arrayidx, align 4
 293   %call = tail call fast float @_Z3powff(float %tmp, float -5.000000e-01)
 294   store float %call, float addrspace(1)* %a, align 4
 295   ret void
 296 }
 297
 298 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_c
 299 ; GCN: %__powx2 = fmul fast float %tmp, %tmp
 300 ; GCN: %__powx21 = fmul fast float %__powx2, %__powx2
 301 ; GCN: %__powx22 = fmul fast float %__powx2, %tmp
 302 ; GCN: %[[r0:.*]] = fmul fast float %__powx21, %__powx21
 303 ; GCN: %__powprod3 = fmul fast float %[[r0]], %__powx22
     ; pow(x, 11.0) with x loaded from a[1]. All three runs expect the call to
     ; become a chain of fast fmuls: x^2, x^4 (x^2 * x^2), x^3 (x^2 * x),
     ; x^8 (x^4 * x^4) and finally x^8 * x^3.
 304 define amdgpu_kernel void @test_pow_c(float addrspace(1)* nocapture %a) {
 305 entry:
 306   %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
 307   %tmp = load float, float addrspace(1)* %arrayidx, align 4
 308   %call = tail call fast float @_Z3powff(float %tmp, float 1.100000e+01)
 309   store float %call, float addrspace(1)* %a, align 4
 310   ret void
 311 }
 312
 313 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_powr_c
 314 ; GCN: %__powx2 = fmul fast float %tmp, %tmp
 315 ; GCN: %__powx21 = fmul fast float %__powx2, %__powx2
 316 ; GCN: %__powx22 = fmul fast float %__powx2, %tmp
 317 ; GCN: %[[r0:.*]] = fmul fast float %__powx21, %__powx21
 318 ; GCN: %__powprod3 = fmul fast float %[[r0]], %__powx22
     ; powr(x, 11.0) with x loaded from a[1]. All three runs expect the same
     ; fast fmul chain as for pow with a constant exponent of 11:
     ; x^2, x^4, x^3, x^8, then x^8 * x^3.
 319 define amdgpu_kernel void @test_powr_c(float addrspace(1)* nocapture %a) {
 320 entry:
 321   %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
 322   %tmp = load float, float addrspace(1)* %arrayidx, align 4
 323   %call = tail call fast float @_Z4powrff(float %tmp, float 1.100000e+01)
 324   store float %call, float addrspace(1)* %a, align 4
 325   ret void
 326 }
 327
 328 declare float @_Z4powrff(float, float)
 329
 330 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pown_c
 331 ; GCN: %__powx2 = fmul fast float %tmp, %tmp
 332 ; GCN: %__powx21 = fmul fast float %__powx2, %__powx2
 333 ; GCN: %__powx22 = fmul fast float %__powx2, %tmp
 334 ; GCN: %[[r0:.*]] = fmul fast float %__powx21, %__powx21
 335 ; GCN: %__powprod3 = fmul fast float %[[r0]], %__powx22
     ; pown(x, 11) with x loaded from a[1]; the exponent is an i32 constant.
     ; All three runs expect the fast fmul chain x^2, x^4, x^3, x^8, then
     ; x^8 * x^3.
 336 define amdgpu_kernel void @test_pown_c(float addrspace(1)* nocapture %a) {
 337 entry:
 338   %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
 339   %tmp = load float, float addrspace(1)* %arrayidx, align 4
 340   %call = tail call fast float @_Z4pownfi(float %tmp, i32 11)
 341   store float %call, float addrspace(1)* %a, align 4
 342   ret void
 343 }
 344
 345 declare float @_Z4pownfi(float, i32)
 346
 347 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow(
 348 ; GCN-POSTLINK: tail call fast float @_Z3powff(float %tmp, float 1.013000e+03)
 349 ; GCN-PRELINK: %__fabs = tail call fast float @_Z4fabsf(float %tmp)
 350 ; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %__fabs)
 351 ; GCN-PRELINK: %__ylogx = fmul fast float %__log2, 1.013000e+03
 352 ; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx)
 353 ; GCN-PRELINK: %[[r0:.*]] = bitcast float %tmp to i32
 354 ; GCN-PRELINK: %__pow_sign = and i32 %[[r0]], -2147483648
 355 ; GCN-PRELINK: %[[r1:.*]] = bitcast float %__exp2 to i32
 356 ; GCN-PRELINK: %[[r2:.*]] = or i32 %__pow_sign, %[[r1]]
 357 ; GCN-PRELINK: %[[r3:.*]] = bitcast float addrspace(1)* %a to i32 addrspace(1)*
 358 ; GCN-PRELINK: store i32 %[[r2]], i32 addrspace(1)* %[[r3]], align 4
     ; pow(x, 1013.0) on a loaded x. Without -amdgpu-prelink the call is
     ; expected to stay unchanged. With it the expected expansion is
     ; exp2(1013 * log2(fabs(x))), after which the sign bit of x (mask
     ; -2147483648) is OR-ed into the integer bits of the result and the value
     ; is stored through an i32 view of the pointer.
     ; The label pattern above ends in '(' so that it matches only this function
     ; and not the other functions whose names start with @test_pow.
 359 define amdgpu_kernel void @test_pow(float addrspace(1)* nocapture %a) {
 360 entry:
 361   %tmp = load float, float addrspace(1)* %a, align 4
 362   %call = tail call fast float @_Z3powff(float %tmp, float 1.013000e+03)
 363   store float %call, float addrspace(1)* %a, align 4
 364   ret void
 365 }
 366
 367 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_powr(
 368 ; GCN-POSTLINK: tail call fast float @_Z4powrff(float %tmp, float %tmp1)
 369 ; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %tmp)
 370 ; GCN-PRELINK: %__ylogx = fmul fast float %__log2, %tmp1
 371 ; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx)
 372 ; GCN-PRELINK: store float %__exp2, float addrspace(1)* %a, align 4
 373 ; GCN-NATIVE:  %__log2 = tail call fast float @_Z11native_log2f(float %tmp)
 374 ; GCN-NATIVE:  %__ylogx = fmul fast float %__log2, %tmp1
 375 ; GCN-NATIVE:  %__exp2 = tail call fast float @_Z11native_exp2f(float %__ylogx)
 376 ; GCN-NATIVE:  store float %__exp2, float addrspace(1)* %a, align 4
     ; powr(x, y) with x loaded from a[0] and y from a[1]. Without
     ; -amdgpu-prelink the call is expected to stay unchanged. With it the
     ; expected expansion is exp2(y * log2(x)) with no fabs or sign handling;
     ; with -amdgpu-use-native the same shape is expected using native_log2 and
     ; native_exp2.
     ; The label pattern above ends in '(' so that it matches only this function
     ; and not @test_powr_c.
 377 define amdgpu_kernel void @test_powr(float addrspace(1)* nocapture %a) {
 378 entry:
 379   %tmp = load float, float addrspace(1)* %a, align 4
 380   %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
 381   %tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
 382   %call = tail call fast float @_Z4powrff(float %tmp, float %tmp1)
 383   store float %call, float addrspace(1)* %a, align 4
 384   ret void
 385 }
 386
 387 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pown(
 388 ; GCN-POSTLINK: tail call fast float @_Z4pownfi(float %tmp, i32 %conv)
 389 ; GCN-PRELINK: %conv = fptosi float %tmp1 to i32
 390 ; GCN-PRELINK: %__fabs = tail call fast float @_Z4fabsf(float %tmp)
 391 ; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %__fabs)
 392 ; GCN-PRELINK: %pownI2F = sitofp i32 %conv to float
 393 ; GCN-PRELINK: %__ylogx = fmul fast float %__log2, %pownI2F
 394 ; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx)
 395 ; GCN-PRELINK: %__yeven = shl i32 %conv, 31
 396 ; GCN-PRELINK: %[[r0:.*]] = bitcast float %tmp to i32
 397 ; GCN-PRELINK: %__pow_sign = and i32 %__yeven, %[[r0]]
 398 ; GCN-PRELINK: %[[r1:.*]] = bitcast float %__exp2 to i32
 399 ; GCN-PRELINK: %[[r2:.*]] = or i32 %__pow_sign, %[[r1]]
 400 ; GCN-PRELINK: %[[r3:.*]] = bitcast float addrspace(1)* %a to i32 addrspace(1)*
 401 ; GCN-PRELINK: store i32 %[[r2]], i32 addrspace(1)* %[[r3]], align 4
     ; pown(x, n) with x loaded from a[0] and n obtained by converting a[1] to
     ; i32. Without -amdgpu-prelink the call is expected to stay unchanged.
     ; With it the expected expansion is exp2(float(n) * log2(fabs(x))); the
     ; lowest bit of n is shifted into bit 31, AND-ed with the bits of x, and
     ; the result is OR-ed into the integer bits of the exp2 value before the
     ; store through an i32 view of the pointer.
     ; The label pattern above ends in '(' so that it matches only this function
     ; and not @test_pown_c.
 402 define amdgpu_kernel void @test_pown(float addrspace(1)* nocapture %a) {
 403 entry:
 404   %tmp = load float, float addrspace(1)* %a, align 4
 405   %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
 406   %tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
 407   %conv = fptosi float %tmp1 to i32
 408   %call = tail call fast float @_Z4pownfi(float %tmp, i32 %conv)
 409   store float %call, float addrspace(1)* %a, align 4
 410   ret void
 411 }
 412
 413 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_1
 414 ; GCN: %tmp = load float, float addrspace(1)* %arrayidx, align 4
 415 ; GCN: store float %tmp, float addrspace(1)* %a, align 4
     ; rootn(x, 1) with x loaded from a[1]. All three runs expect the loaded
     ; value itself to be stored to a[0].
 416 define amdgpu_kernel void @test_rootn_1(float addrspace(1)* nocapture %a) {
 417 entry:
 418   %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
 419   %tmp = load float, float addrspace(1)* %arrayidx, align 4
 420   %call = tail call fast float @_Z5rootnfi(float %tmp, i32 1)
 421   store float %call, float addrspace(1)* %a, align 4
 422   ret void
 423 }
 424
 425 declare float @_Z5rootnfi(float, i32)
 426
 427 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_2
 428 ; GCN-POSTLINK: tail call fast float @_Z5rootnfi(float %tmp, i32 2)
 429 ; GCN-PRELINK: %__rootn2sqrt = tail call fast float @_Z4sqrtf(float %tmp)
     ; rootn(x, 2) on a loaded x. Without -amdgpu-prelink the call is expected
     ; to stay unchanged; with it a sqrt call named %__rootn2sqrt is expected.
 430 define amdgpu_kernel void @test_rootn_2(float addrspace(1)* nocapture %a) {
 431 entry:
 432   %tmp = load float, float addrspace(1)* %a, align 4
 433   %call = tail call fast float @_Z5rootnfi(float %tmp, i32 2)
 434   store float %call, float addrspace(1)* %a, align 4
 435   ret void
 436 }
 437
 438 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_3
 439 ; GCN-POSTLINK: tail call fast float @_Z5rootnfi(float %tmp, i32 3)
 440 ; GCN-PRELINK: %__rootn2cbrt = tail call fast float @_Z4cbrtf(float %tmp)
     ; rootn(x, 3) on a loaded x. Without -amdgpu-prelink the call is expected
     ; to stay unchanged; with it a cbrt call named %__rootn2cbrt is expected.
 441 define amdgpu_kernel void @test_rootn_3(float addrspace(1)* nocapture %a) {
 442 entry:
 443   %tmp = load float, float addrspace(1)* %a, align 4
 444   %call = tail call fast float @_Z5rootnfi(float %tmp, i32 3)
 445   store float %call, float addrspace(1)* %a, align 4
 446   ret void
 447 }
 448
 449 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_m1
 450 ; GCN: fdiv fast float 1.000000e+00, %tmp
     ; rootn(x, -1) on a loaded x. All three runs expect a fast fdiv of 1.0 by x.
 451 define amdgpu_kernel void @test_rootn_m1(float addrspace(1)* nocapture %a) {
 452 entry:
 453   %tmp = load float, float addrspace(1)* %a, align 4
 454   %call = tail call fast float @_Z5rootnfi(float %tmp, i32 -1)
 455   store float %call, float addrspace(1)* %a, align 4
 456   ret void
 457 }
 458
 459 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_m2
 460 ; GCN-POSTLINK: tail call fast float @_Z5rootnfi(float %tmp, i32 -2)
 461 ; GCN-PRELINK: %__rootn2rsqrt = tail call fast float @_Z5rsqrtf(float %tmp)
     ; rootn(x, -2) on a loaded x. Without -amdgpu-prelink the call is expected
     ; to stay unchanged; with it an rsqrt call named %__rootn2rsqrt is expected.
 462 define amdgpu_kernel void @test_rootn_m2(float addrspace(1)* nocapture %a) {
 463 entry:
 464   %tmp = load float, float addrspace(1)* %a, align 4
 465   %call = tail call fast float @_Z5rootnfi(float %tmp, i32 -2)
 466   store float %call, float addrspace(1)* %a, align 4
 467   ret void
 468 }
 469
 470 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_0x
 471 ; GCN: store float %y, float addrspace(1)* %a
     ; fma(0.0, x, y) with x loaded and y a kernel argument. All three runs
     ; expect %y to be stored directly.
 472 define amdgpu_kernel void @test_fma_0x(float addrspace(1)* nocapture %a, float %y) {
 473 entry:
 474   %tmp = load float, float addrspace(1)* %a, align 4
 475   %call = tail call fast float @_Z3fmafff(float 0.000000e+00, float %tmp, float %y)
 476   store float %call, float addrspace(1)* %a, align 4
 477   ret void
 478 }
 479
 480 declare float @_Z3fmafff(float, float, float)
 481
 482 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_x0
 483 ; GCN: store float %y, float addrspace(1)* %a
     ; fma(x, 0.0, y) with x loaded and y a kernel argument. All three runs
     ; expect %y to be stored directly.
 484 define amdgpu_kernel void @test_fma_x0(float addrspace(1)* nocapture %a, float %y) {
 485 entry:
 486   %tmp = load float, float addrspace(1)* %a, align 4
 487   %call = tail call fast float @_Z3fmafff(float %tmp, float 0.000000e+00, float %y)
 488   store float %call, float addrspace(1)* %a, align 4
 489   ret void
 490 }
 491
 492 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_mad_0x
 493 ; GCN: store float %y, float addrspace(1)* %a
     ; mad(0.0, x, y) with x loaded and y a kernel argument. All three runs
     ; expect %y to be stored directly.
 494 define amdgpu_kernel void @test_mad_0x(float addrspace(1)* nocapture %a, float %y) {
 495 entry:
 496   %tmp = load float, float addrspace(1)* %a, align 4
 497   %call = tail call fast float @_Z3madfff(float 0.000000e+00, float %tmp, float %y)
 498   store float %call, float addrspace(1)* %a, align 4
 499   ret void
 500 }
 501
 502 declare float @_Z3madfff(float, float, float)
 503
 504 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_mad_x0
 505 ; GCN: store float %y, float addrspace(1)* %a
     ; mad(x, 0.0, y) with x loaded and y a kernel argument. All three runs
     ; expect %y to be stored directly.
 506 define amdgpu_kernel void @test_mad_x0(float addrspace(1)* nocapture %a, float %y) {
 507 entry:
 508   %tmp = load float, float addrspace(1)* %a, align 4
 509   %call = tail call fast float @_Z3madfff(float %tmp, float 0.000000e+00, float %y)
 510   store float %call, float addrspace(1)* %a, align 4
 511   ret void
 512 }
 513
 514 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_x1y
 515 ; GCN: %fmaadd = fadd fast float %tmp, %y
     ; fma(x, 1.0, y) with x loaded and y a kernel argument. All three runs
     ; expect a fast fadd of x and y, named %fmaadd.
 516 define amdgpu_kernel void @test_fma_x1y(float addrspace(1)* nocapture %a, float %y) {
 517 entry:
 518   %tmp = load float, float addrspace(1)* %a, align 4
 519   %call = tail call fast float @_Z3fmafff(float %tmp, float 1.000000e+00, float %y)
 520   store float %call, float addrspace(1)* %a, align 4
 521   ret void
 522 }
 523
 524 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_1xy
 525 ; GCN: %fmaadd = fadd fast float %tmp, %y
     ; fma(1.0, x, y) with x loaded and y a kernel argument. All three runs
     ; expect a fast fadd of x and y, named %fmaadd.
 526 define amdgpu_kernel void @test_fma_1xy(float addrspace(1)* nocapture %a, float %y) {
 527 entry:
 528   %tmp = load float, float addrspace(1)* %a, align 4
 529   %call = tail call fast float @_Z3fmafff(float 1.000000e+00, float %tmp, float %y)
 530   store float %call, float addrspace(1)* %a, align 4
 531   ret void
 532 }
 533
 534 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_xy0
 535 ; GCN: %fmamul = fmul fast float %tmp1, %tmp
     ; fma(x, y, 0.0) with x loaded from a[1] and y from a[0]. All three runs
     ; expect a fast fmul of the two loaded values, named %fmamul.
 536 define amdgpu_kernel void @test_fma_xy0(float addrspace(1)* nocapture %a) {
 537 entry:
 538   %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
 539   %tmp = load float, float addrspace(1)* %arrayidx, align 4
 540   %tmp1 = load float, float addrspace(1)* %a, align 4
 541   %call = tail call fast float @_Z3fmafff(float %tmp, float %tmp1, float 0.000000e+00)
 542   store float %call, float addrspace(1)* %a, align 4
 543   ret void
 544 }
 545
 546 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_exp(
 547 ; GCN-NATIVE: tail call fast float @_Z10native_expf(float %tmp)
     ; exp(x) on a loaded x. With -amdgpu-use-native the call is expected to
     ; become a native_exp call; the other two runs only check the label.
     ; The label pattern above ends in '(' so that it matches only this function
     ; and not @test_use_native_exp2 or @test_use_native_exp10.
 548 define amdgpu_kernel void @test_use_native_exp(float addrspace(1)* nocapture %a) {
 549 entry:
 550   %tmp = load float, float addrspace(1)* %a, align 4
 551   %call = tail call fast float @_Z3expf(float %tmp)
 552   store float %call, float addrspace(1)* %a, align 4
 553   ret void
 554 }
 555
 556 declare float @_Z3expf(float)
 557
 558 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_exp2
 559 ; GCN-NATIVE: tail call fast float @_Z11native_exp2f(float %tmp)
     ; exp2(x) on a loaded x. With -amdgpu-use-native the call is expected to
     ; become a native_exp2 call; the other two runs only check the label.
 560 define amdgpu_kernel void @test_use_native_exp2(float addrspace(1)* nocapture %a) {
 561 entry:
 562   %tmp = load float, float addrspace(1)* %a, align 4
 563   %call = tail call fast float @_Z4exp2f(float %tmp)
 564   store float %call, float addrspace(1)* %a, align 4
 565   ret void
 566 }
 567
 568 declare float @_Z4exp2f(float)
 569
 570 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_exp10
 571 ; GCN-NATIVE: tail call fast float @_Z12native_exp10f(float %tmp)
     ; exp10(x) on a loaded x. With -amdgpu-use-native the call is expected to
     ; become a native_exp10 call; the other two runs only check the label.
 572 define amdgpu_kernel void @test_use_native_exp10(float addrspace(1)* nocapture %a) {
 573 entry:
 574   %tmp = load float, float addrspace(1)* %a, align 4
 575   %call = tail call fast float @_Z5exp10f(float %tmp)
 576   store float %call, float addrspace(1)* %a, align 4
 577   ret void
 578 }
 579
 580 declare float @_Z5exp10f(float)
 581
 582 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_log(
 583 ; GCN-NATIVE: tail call fast float @_Z10native_logf(float %tmp)
     ; log(x) on a loaded x. With -amdgpu-use-native the call is expected to
     ; become a native_log call; the other two runs only check the label.
     ; The label pattern above ends in '(' so that it matches only this function
     ; and not @test_use_native_log2 or @test_use_native_log10.
 584 define amdgpu_kernel void @test_use_native_log(float addrspace(1)* nocapture %a) {
 585 entry:
 586   %tmp = load float, float addrspace(1)* %a, align 4
 587   %call = tail call fast float @_Z3logf(float %tmp)
 588   store float %call, float addrspace(1)* %a, align 4
 589   ret void
 590 }
 591
 592 declare float @_Z3logf(float)
 593
 594 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_log2
 595 ; GCN-NATIVE: tail call fast float @_Z11native_log2f(float %tmp)
     ; log2(x) on a loaded x. With -amdgpu-use-native the call is expected to
     ; become a native_log2 call; the other two runs only check the label.
 596 define amdgpu_kernel void @test_use_native_log2(float addrspace(1)* nocapture %a) {
 597 entry:
 598   %tmp = load float, float addrspace(1)* %a, align 4
 599   %call = tail call fast float @_Z4log2f(float %tmp)
 600   store float %call, float addrspace(1)* %a, align 4
 601   ret void
 602 }
 603
 604 declare float @_Z4log2f(float)
 605
 606 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_log10
 607 ; GCN-NATIVE: tail call fast float @_Z12native_log10f(float %tmp)
     ; log10(x) on a loaded x. With -amdgpu-use-native the call is expected to
     ; become a native_log10 call; the other two runs only check the label.
 608 define amdgpu_kernel void @test_use_native_log10(float addrspace(1)* nocapture %a) {
 609 entry:
 610   %tmp = load float, float addrspace(1)* %a, align 4
 611   %call = tail call fast float @_Z5log10f(float %tmp)
 612   store float %call, float addrspace(1)* %a, align 4
 613   ret void
 614 }
 615
 616 declare float @_Z5log10f(float)
 617
 618 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_powr
 619 ; GCN-NATIVE: %tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
 620 ; GCN-NATIVE: %__log2 = tail call fast float @_Z11native_log2f(float %tmp)
 621 ; GCN-NATIVE: %__ylogx = fmul fast float %__log2, %tmp1
 622 ; GCN-NATIVE: %__exp2 = tail call fast float @_Z11native_exp2f(float %__ylogx)
 623 ; GCN-NATIVE: store float %__exp2, float addrspace(1)* %a, align 4
     ; powr(x, y) with x loaded from a[0] and y from a[1]. With
     ; -amdgpu-use-native the expected expansion is
     ; native_exp2(y * native_log2(x)); the other two runs only check the label.
 624 define amdgpu_kernel void @test_use_native_powr(float addrspace(1)* nocapture %a) {
 625 entry:
 626   %tmp = load float, float addrspace(1)* %a, align 4
 627   %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
 628   %tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
 629   %call = tail call fast float @_Z4powrff(float %tmp, float %tmp1)
 630   store float %call, float addrspace(1)* %a, align 4
 631   ret void
 632 }
 633
 634 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_sqrt
 635 ; GCN-NATIVE: tail call fast float @_Z11native_sqrtf(float %tmp)
     ; sqrt(x) on a loaded float x. With -amdgpu-use-native the call is expected
     ; to become a native_sqrt call; the other two runs only check the label.
 636 define amdgpu_kernel void @test_use_native_sqrt(float addrspace(1)* nocapture %a) {
 637 entry:
 638   %tmp = load float, float addrspace(1)* %a, align 4
 639   %call = tail call fast float @_Z4sqrtf(float %tmp)
 640   store float %call, float addrspace(1)* %a, align 4
 641   ret void
 642 }
 643
 644 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_dont_use_native_sqrt_fast_f64
 645 ; GCN: tail call fast double @_Z4sqrtd(double %tmp)
     ; Negative test: sqrt on a loaded double. All three runs, including the one
     ; with -amdgpu-use-native, expect the original sqrt(double) call to remain.
 646 define amdgpu_kernel void @test_dont_use_native_sqrt_fast_f64(double addrspace(1)* nocapture %a) {
 647 entry:
 648   %tmp = load double, double addrspace(1)* %a, align 8
 649   %call = tail call fast double @_Z4sqrtd(double %tmp)
 650   store double %call, double addrspace(1)* %a, align 8
 651   ret void
 652 }
 653
 654 declare float @_Z4sqrtf(float)
 655 declare double @_Z4sqrtd(double)
 656
 657 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_rsqrt
 658 ; GCN-NATIVE: tail call fast float @_Z12native_rsqrtf(float %tmp)
     ; rsqrt(x) on a loaded x. With -amdgpu-use-native the call is expected to
     ; become a native_rsqrt call; the other two runs only check the label.
 659 define amdgpu_kernel void @test_use_native_rsqrt(float addrspace(1)* nocapture %a) {
 660 entry:
 661   %tmp = load float, float addrspace(1)* %a, align 4
 662   %call = tail call fast float @_Z5rsqrtf(float %tmp)
 663   store float %call, float addrspace(1)* %a, align 4
 664   ret void
 665 }
 666
 667 declare float @_Z5rsqrtf(float)
 668
 669 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_tan
 670 ; GCN-NATIVE: tail call fast float @_Z10native_tanf(float %tmp)
     ; Input: a fast single-precision tan call on a[0]. The -amdgpu-use-native
     ; configuration expects the callee to be replaced by native_tan with the
     ; same operand.
 671 define amdgpu_kernel void @test_use_native_tan(float addrspace(1)* nocapture %a) {
 672 entry:
 673   %tmp = load float, float addrspace(1)* %a, align 4
 674   %call = tail call fast float @_Z3tanf(float %tmp)
 675   store float %call, float addrspace(1)* %a, align 4
 676   ret void
 677 }
 678
 679 declare float @_Z3tanf(float)
 680
 681 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_sincos
 682 ; GCN-NATIVE: tail call float @_Z10native_sinf(float %tmp)
 683 ; GCN-NATIVE: tail call float @_Z10native_cosf(float %tmp)
     ; Input: a fast sincos call taking a[0] and an out-pointer derived from
     ; &a[1] (addrspacecast to an unqualified float*). The -amdgpu-use-native
     ; configuration expects separate native_sin and native_cos calls on the
     ; loaded value. Note that the checks above match those calls without the
     ; `fast` flag even though the input call carries it.
 684 define amdgpu_kernel void @test_use_native_sincos(float addrspace(1)* %a) {
 685 entry:
 686   %tmp = load float, float addrspace(1)* %a, align 4
 687   %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
 688   %tmp1 = addrspacecast float addrspace(1)* %arrayidx1 to float*
 689   %call = tail call fast float @_Z6sincosfPf(float %tmp, float* %tmp1)
 690   store float %call, float addrspace(1)* %a, align 4
 691   ret void
 692 }
 693
 694 declare float @_Z6sincosfPf(float, float*)
 695
 696 %opencl.pipe_t = type opaque
 697 %opencl.reserve_id_t = type opaque
 698
 699 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr)
 700 ; GCN-PRELINK: call i32 @__read_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}}, i32* %{{.*}}) #[[$NOUNWIND:[0-9]+]]
 701 ; GCN-PRELINK: call i32 @__read_pipe_4_4(%opencl.pipe_t addrspace(1)* %{{.*}}, %opencl.reserve_id_t addrspace(5)* %{{.*}}, i32 2, i32* %{{.*}}) #[[$NOUNWIND]]
     ; Input: __read_pipe_2 and __read_pipe_4 calls whose two trailing i32
     ; arguments are both 4 (presumably packet size and alignment; confirm
     ; against the pipe builtin definitions) and whose i8* data pointer is
     ; derived from an i32 pointer. The prelink configuration expects the
     ; specialized __read_pipe_2_4 / __read_pipe_4_4 callees, which take an i32*
     ; and drop the two trailing arguments. The first check also captures the
     ; call-site attribute group number as $NOUNWIND for reuse by later checks.
     ; The __reserve_read_pipe and __commit_read_pipe calls are not checked.
 702 define amdgpu_kernel void @test_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr) local_unnamed_addr {
 703 entry:
 704   %tmp = bitcast i32 addrspace(1)* %ptr to i8 addrspace(1)*
 705   %tmp1 = addrspacecast i8 addrspace(1)* %tmp to i8*
 706   %tmp2 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8* %tmp1, i32 4, i32 4) #0
 707   %tmp3 = tail call %opencl.reserve_id_t addrspace(5)* @__reserve_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4)
 708   %tmp4 = tail call i32 @__read_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 2, i8* %tmp1, i32 4, i32 4) #0
 709   tail call void @__commit_read_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 4, i32 4)
 710   ret void
 711 }
 712
 713 declare i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)*, i8*, i32, i32)
 714
 715 declare %opencl.reserve_id_t addrspace(5)* @__reserve_read_pipe(%opencl.pipe_t addrspace(1)*, i32, i32, i32)
 716
 717 declare i32 @__read_pipe_4(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t addrspace(5)*, i32, i8*, i32, i32)
 718
 719 declare void @__commit_read_pipe(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t addrspace(5)*, i32, i32)
 720
 721 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr)
 722 ; GCN-PRELINK: call i32 @__write_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}}, i32* %{{.*}}) #[[$NOUNWIND]]
 723 ; GCN-PRELINK: call i32 @__write_pipe_4_4(%opencl.pipe_t addrspace(1)* %{{.*}}, %opencl.reserve_id_t addrspace(5)* %{{.*}}, i32 2, i32* %{{.*}}) #[[$NOUNWIND]]
     ; Write-side counterpart of the read-pipe test: __write_pipe_2 and
     ; __write_pipe_4 calls whose two trailing i32 arguments are both 4 and whose
     ; i8* data pointer is derived from an i32 pointer. The prelink configuration
     ; expects the specialized __write_pipe_2_4 / __write_pipe_4_4 callees, which
     ; take an i32* and drop the two trailing arguments. Both checks reuse the
     ; $NOUNWIND attribute group number captured by an earlier check in the file.
     ; The __reserve_write_pipe and __commit_write_pipe calls are not checked.
 724 define amdgpu_kernel void @test_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr) local_unnamed_addr {
 725 entry:
 726   %tmp = bitcast i32 addrspace(1)* %ptr to i8 addrspace(1)*
 727   %tmp1 = addrspacecast i8 addrspace(1)* %tmp to i8*
 728   %tmp2 = tail call i32 @__write_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8* %tmp1, i32 4, i32 4) #0
 729   %tmp3 = tail call %opencl.reserve_id_t addrspace(5)* @__reserve_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4) #0
 730   %tmp4 = tail call i32 @__write_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 2, i8* %tmp1, i32 4, i32 4) #0
 731   tail call void @__commit_write_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 4, i32 4) #0
 732   ret void
 733 }
 734
 735 declare i32 @__write_pipe_2(%opencl.pipe_t addrspace(1)*, i8*, i32, i32) local_unnamed_addr
 736
 737 declare %opencl.reserve_id_t addrspace(5)* @__reserve_write_pipe(%opencl.pipe_t addrspace(1)*, i32, i32, i32) local_unnamed_addr
 738
 739 declare i32 @__write_pipe_4(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t addrspace(5)*, i32, i8*, i32, i32) local_unnamed_addr
 740
 741 declare void @__commit_write_pipe(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t addrspace(5)*, i32, i32) local_unnamed_addr
 742
 743 %struct.S = type { [100 x i32] }
 744
 745 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pipe_size
 746 ; GCN-PRELINK: call i32 @__read_pipe_2_1(%opencl.pipe_t addrspace(1)* %{{.*}}, i8* %{{.*}}) #[[$NOUNWIND]]
 747 ; GCN-PRELINK: call i32 @__read_pipe_2_2(%opencl.pipe_t addrspace(1)* %{{.*}}, i16* %{{.*}}) #[[$NOUNWIND]]
 748 ; GCN-PRELINK: call i32 @__read_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}}, i32* %{{.*}}) #[[$NOUNWIND]]
 749 ; GCN-PRELINK: call i32 @__read_pipe_2_8(%opencl.pipe_t addrspace(1)* %{{.*}}, i64* %{{.*}}) #[[$NOUNWIND]]
 750 ; GCN-PRELINK: call i32 @__read_pipe_2_16(%opencl.pipe_t addrspace(1)* %{{.*}}, <2 x i64>* %{{.*}}) #[[$NOUNWIND]]
 751 ; GCN-PRELINK: call i32 @__read_pipe_2_32(%opencl.pipe_t addrspace(1)* %{{.*}}, <4 x i64>* %{{.*}}) #[[$NOUNWIND]]
 752 ; GCN-PRELINK: call i32 @__read_pipe_2_64(%opencl.pipe_t addrspace(1)* %{{.*}}, <8 x i64>* %{{.*}}) #[[$NOUNWIND]]
 753 ; GCN-PRELINK: call i32 @__read_pipe_2_128(%opencl.pipe_t addrspace(1)* %{{.*}}, <16 x i64>* %{{.*}}) #[[$NOUNWIND]]
 754 ; GCN-PRELINK: call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %{{.*}}, i8* %{{.*}}, i32 400, i32 4) #[[$NOUNWIND]]
     ; Input: nine __read_pipe_2 calls. Eight of them pass equal trailing i32
     ; arguments of 1, 2, 4, 8, 16, 32, 64 and 128 with a data pointer derived
     ; from a matching-width element type; the prelink configuration expects each
     ; to become __read_pipe_2_<N> taking a typed pointer and no trailing
     ; arguments. The last call passes (400, 4) with a pointer to %struct.S and
     ; is expected to remain a generic __read_pipe_2 call.
     ; Every check spells out the `, ` argument separators and the closing `)`
     ; so that the wildcards only stand in for value names, not call syntax.
 755 define amdgpu_kernel void @test_pipe_size(%opencl.pipe_t addrspace(1)* %p1, i8 addrspace(1)* %ptr1, %opencl.pipe_t addrspace(1)* %p2, i16 addrspace(1)* %ptr2, %opencl.pipe_t addrspace(1)* %p4, i32 addrspace(1)* %ptr4, %opencl.pipe_t addrspace(1)* %p8, i64 addrspace(1)* %ptr8, %opencl.pipe_t addrspace(1)* %p16, <2 x i64> addrspace(1)* %ptr16, %opencl.pipe_t addrspace(1)* %p32, <4 x i64> addrspace(1)* %ptr32, %opencl.pipe_t addrspace(1)* %p64, <8 x i64> addrspace(1)* %ptr64, %opencl.pipe_t addrspace(1)* %p128, <16 x i64> addrspace(1)* %ptr128, %opencl.pipe_t addrspace(1)* %pu, %struct.S addrspace(1)* %ptru) local_unnamed_addr #0 {
 756 entry:
 757   %tmp = addrspacecast i8 addrspace(1)* %ptr1 to i8*
 758   %tmp1 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p1, i8* %tmp, i32 1, i32 1) #0
 759   %tmp2 = bitcast i16 addrspace(1)* %ptr2 to i8 addrspace(1)*
 760   %tmp3 = addrspacecast i8 addrspace(1)* %tmp2 to i8*
 761   %tmp4 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p2, i8* %tmp3, i32 2, i32 2) #0
 762   %tmp5 = bitcast i32 addrspace(1)* %ptr4 to i8 addrspace(1)*
 763   %tmp6 = addrspacecast i8 addrspace(1)* %tmp5 to i8*
 764   %tmp7 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p4, i8* %tmp6, i32 4, i32 4) #0
 765   %tmp8 = bitcast i64 addrspace(1)* %ptr8 to i8 addrspace(1)*
 766   %tmp9 = addrspacecast i8 addrspace(1)* %tmp8 to i8*
 767   %tmp10 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p8, i8* %tmp9, i32 8, i32 8) #0
 768   %tmp11 = bitcast <2 x i64> addrspace(1)* %ptr16 to i8 addrspace(1)*
 769   %tmp12 = addrspacecast i8 addrspace(1)* %tmp11 to i8*
 770   %tmp13 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p16, i8* %tmp12, i32 16, i32 16) #0
 771   %tmp14 = bitcast <4 x i64> addrspace(1)* %ptr32 to i8 addrspace(1)*
 772   %tmp15 = addrspacecast i8 addrspace(1)* %tmp14 to i8*
 773   %tmp16 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p32, i8* %tmp15, i32 32, i32 32) #0
 774   %tmp17 = bitcast <8 x i64> addrspace(1)* %ptr64 to i8 addrspace(1)*
 775   %tmp18 = addrspacecast i8 addrspace(1)* %tmp17 to i8*
 776   %tmp19 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p64, i8* %tmp18, i32 64, i32 64) #0
 777   %tmp20 = bitcast <16 x i64> addrspace(1)* %ptr128 to i8 addrspace(1)*
 778   %tmp21 = addrspacecast i8 addrspace(1)* %tmp20 to i8*
 779   %tmp22 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p128, i8* %tmp21, i32 128, i32 128) #0
 780   %tmp23 = bitcast %struct.S addrspace(1)* %ptru to i8 addrspace(1)*
 781   %tmp24 = addrspacecast i8 addrspace(1)* %tmp23 to i8*
 782   %tmp25 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %pu, i8* %tmp24, i32 400, i32 4) #0
 783   ret void
 784 }
 785
 786 ; GCN-PRELINK: declare float @_Z4fabsf(float) local_unnamed_addr #[[$NOUNWIND_READONLY:[0-9]+]]
 787 ; GCN-PRELINK: declare float @_Z4cbrtf(float) local_unnamed_addr #[[$NOUNWIND_READONLY]]
 788 ; GCN-PRELINK: declare float @_Z11native_sqrtf(float) local_unnamed_addr #[[$NOUNWIND_READONLY]]
 789
 790 ; GCN-PRELINK-DAG: attributes #[[$NOUNWIND]] = { nounwind }
 791 ; GCN-PRELINK-DAG: attributes #[[$NOUNWIND_READONLY]] = { nounwind readonly }
 792 attributes #0 = { nounwind }