test/Analysis/CostModel/AMDGPU/fdiv.ll

   1 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,CIFASTF64,NOFP32DENORM,NOFP16,NOFP16-NOFP32DENORM %s
   2 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,CISLOWF64,NOFP32DENORM,NOFP16,NOFP16-NOFP32DENORM  %s
   3 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIFASTF64,NOFP32DENORM,NOFP16,NOFP16-NOFP32DENORM  %s
   4 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=verde -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SISLOWF64,NOFP32DENORM,NOFP16,NOFP16-NOFP32DENORM  %s
   5 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii -mattr=+fp32-denormals < %s | FileCheck -check-prefixes=ALL,FP32DENORMS,SLOWFP32DENORMS,NOFP16,NOFP16-FP32DENORM %s
   6 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+fp32-denormals < %s | FileCheck -check-prefixes=ALL,FP32DENORMS,FASTFP32DENORMS,FP16 %s
   7
   8 ; ALL: 'fdiv_f32'
   9 ; NOFP32DENORM: estimated cost of 12 for {{.*}} fdiv float
  10 ; FP32DENORMS: estimated cost of 10 for {{.*}} fdiv float
  11 define amdgpu_kernel void @fdiv_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 { ; writes (*%vaddr / %b) to *%out
  12   %num = load float, float addrspace(1)* %vaddr ; dividend is read from memory
  13   %quot = fdiv float %num, %b ; scalar f32 division by the kernel argument
  14   store float %quot, float addrspace(1)* %out
  15   ret void
  16 }
  17
  18 ; ALL: 'fdiv_v2f32'
  19 ; NOFP32DENORM: estimated cost of 24 for {{.*}} fdiv <2 x float>
  20 ; FP32DENORMS: estimated cost of 20 for {{.*}} fdiv <2 x float>
  21 define amdgpu_kernel void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 { ; two-lane f32 variant
  22   %num = load <2 x float>, <2 x float> addrspace(1)* %vaddr ; dividend vector is read from memory
  23   %quot = fdiv <2 x float> %num, %b ; lane-wise division by the kernel argument
  24   store <2 x float> %quot, <2 x float> addrspace(1)* %out
  25   ret void
  26 }
  27
  28 ; ALL: 'fdiv_v3f32'
  29 ; Allow for 48/40 when v3f32 is illegal and TargetLowering thinks it needs widening,
  30 ; and 36/30 when it is legal.
  31 ; NOFP32DENORM: estimated cost of {{36|48}} for {{.*}} fdiv <3 x float>
  32 ; FP32DENORMS: estimated cost of {{30|40}} for {{.*}} fdiv <3 x float>
  33 define amdgpu_kernel void @fdiv_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 { ; three-lane f32 variant
  34   %num = load <3 x float>, <3 x float> addrspace(1)* %vaddr ; dividend vector is read from memory
  35   %quot = fdiv <3 x float> %num, %b ; lane-wise division on a non-power-of-two vector width
  36   store <3 x float> %quot, <3 x float> addrspace(1)* %out
  37   ret void
  38 }
  39
  40 ; ALL: 'fdiv_v5f32'
  41 ; Allow for 96/80 when v5f32 is illegal and TargetLowering thinks it needs widening,
  42 ; and 60/50 when it is legal.
  43 ; NOFP32DENORM: estimated cost of {{96|60}} for {{.*}} fdiv <5 x float>
  44 ; FP32DENORMS: estimated cost of {{80|50}} for {{.*}} fdiv <5 x float>
  45 define amdgpu_kernel void @fdiv_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 { ; five-lane f32 variant
  46   %num = load <5 x float>, <5 x float> addrspace(1)* %vaddr ; dividend vector is read from memory
  47   %quot = fdiv <5 x float> %num, %b ; lane-wise division on a non-power-of-two vector width
  48   store <5 x float> %quot, <5 x float> addrspace(1)* %out
  49   ret void
  50 }
  51
  52 ; ALL: 'fdiv_f64'
  53 ; CIFASTF64: estimated cost of 29 for {{.*}} fdiv double
  54 ; CISLOWF64: estimated cost of 33 for {{.*}} fdiv double
  55 ; SIFASTF64: estimated cost of 32 for {{.*}} fdiv double
  56 ; SISLOWF64: estimated cost of 36 for {{.*}} fdiv double
  57 define amdgpu_kernel void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 { ; writes (*%vaddr / %b) to *%out
  58   %num = load double, double addrspace(1)* %vaddr ; dividend is read from memory
  59   %quot = fdiv double %num, %b ; scalar f64 division by the kernel argument
  60   store double %quot, double addrspace(1)* %out
  61   ret void
  62 }
  63
  64 ; ALL: 'fdiv_v2f64'
  65 ; CIFASTF64: estimated cost of 58 for {{.*}} fdiv <2 x double>
  66 ; CISLOWF64: estimated cost of 66 for {{.*}} fdiv <2 x double>
  67 ; SIFASTF64: estimated cost of 64 for {{.*}} fdiv <2 x double>
  68 ; SISLOWF64: estimated cost of 72 for {{.*}} fdiv <2 x double>
  69 define amdgpu_kernel void @fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 { ; two-lane f64 variant
  70   %num = load <2 x double>, <2 x double> addrspace(1)* %vaddr ; dividend vector is read from memory
  71   %quot = fdiv <2 x double> %num, %b ; lane-wise division by the kernel argument
  72   store <2 x double> %quot, <2 x double> addrspace(1)* %out
  73   ret void
  74 }
  75
  76 ; ALL: 'fdiv_v3f64'
  77 ; CIFASTF64: estimated cost of 87 for {{.*}} fdiv <3 x double>
  78 ; CISLOWF64: estimated cost of 99 for {{.*}} fdiv <3 x double>
  79 ; SIFASTF64: estimated cost of 96 for {{.*}} fdiv <3 x double>
  80 ; SISLOWF64: estimated cost of 108 for {{.*}} fdiv <3 x double>
  81 define amdgpu_kernel void @fdiv_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 { ; three-lane f64 variant
  82   %num = load <3 x double>, <3 x double> addrspace(1)* %vaddr ; dividend vector is read from memory
  83   %quot = fdiv <3 x double> %num, %b ; lane-wise division on a non-power-of-two vector width
  84   store <3 x double> %quot, <3 x double> addrspace(1)* %out
  85   ret void
  86 }
  87
  88 ; ALL: 'fdiv_f16'
  89 ; NOFP16-NOFP32DENORM: estimated cost of 12 for {{.*}} fdiv half
  90 ; NOFP16-FP32DENORM: estimated cost of 10 for {{.*}} fdiv half
  91 ; FP16: estimated cost of 10 for {{.*}} fdiv half
  92 define amdgpu_kernel void @fdiv_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 { ; writes (*%vaddr / %b) to *%out
  93   %num = load half, half addrspace(1)* %vaddr ; dividend is read from memory
  94   %quot = fdiv half %num, %b ; scalar f16 division by the kernel argument
  95   store half %quot, half addrspace(1)* %out
  96   ret void
  97 }
  98
  99 ; ALL: 'fdiv_v2f16'
 100 ; NOFP16-NOFP32DENORM: estimated cost of 24 for {{.*}} fdiv <2 x half>
 101 ; NOFP16-FP32DENORM: estimated cost of 20 for {{.*}} fdiv <2 x half>
 102 ; FP16: estimated cost of 20 for {{.*}} fdiv <2 x half>
 103 define amdgpu_kernel void @fdiv_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 { ; two-lane f16 variant
 104   %num = load <2 x half>, <2 x half> addrspace(1)* %vaddr ; dividend vector is read from memory
 105   %quot = fdiv <2 x half> %num, %b ; lane-wise division by the kernel argument
 106   store <2 x half> %quot, <2 x half> addrspace(1)* %out
 107   ret void
 108 }
 109
 110 ; ALL: 'fdiv_v4f16'
 111 ; NOFP16-NOFP32DENORM: estimated cost of 48 for {{.*}} fdiv <4 x half>
 112 ; NOFP16-FP32DENORM: estimated cost of 40 for {{.*}} fdiv <4 x half>
 113 ; FP16: estimated cost of 40 for {{.*}} fdiv <4 x half>
 114 define amdgpu_kernel void @fdiv_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 { ; four-lane f16 variant
 115   %num = load <4 x half>, <4 x half> addrspace(1)* %vaddr ; dividend vector is read from memory
 116   %quot = fdiv <4 x half> %num, %b ; lane-wise division by the kernel argument
 117   store <4 x half> %quot, <4 x half> addrspace(1)* %out
 118   ret void
 119 }
 120
 121 ; ALL: 'rcp_f32'
 122 ; NOFP32DENORM: estimated cost of 3 for {{.*}} fdiv float
 123 ; SLOWFP32DENORMS: estimated cost of 10 for {{.*}} fdiv float
 124 ; FASTFP32DENORMS: estimated cost of 10 for {{.*}} fdiv float
 125 define amdgpu_kernel void @rcp_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 { ; writes (1.0 / *%vaddr) to *%out
 126   %den = load float, float addrspace(1)* %vaddr ; divisor is read from memory
 127   %rcp = fdiv float 1.0, %den ; constant 1.0 dividend, so this fdiv is a reciprocal
 128   store float %rcp, float addrspace(1)* %out
 129   ret void
 130 }
 131
 132 ; ALL: 'rcp_f16'
 133 ; NOFP16-NOFP32DENORM: estimated cost of 3 for {{.*}} fdiv half
 134 ; NOFP16-FP32DENORM: estimated cost of 10 for {{.*}} fdiv half
 135 ; FP16: estimated cost of 3 for {{.*}} fdiv half
 136 define amdgpu_kernel void @rcp_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 { ; writes (1.0 / *%vaddr) to *%out
 137   %den = load half, half addrspace(1)* %vaddr ; divisor is read from memory
 138   %rcp = fdiv half 1.0, %den ; constant 1.0 dividend, so this fdiv is a reciprocal
 139   store half %rcp, half addrspace(1)* %out
 140   ret void
 141 }
 142
 143 ; ALL: 'rcp_f64'
 144 ; CIFASTF64: estimated cost of 29 for {{.*}} fdiv double
 145 ; CISLOWF64: estimated cost of 33 for {{.*}} fdiv double
 146 ; SIFASTF64: estimated cost of 32 for {{.*}} fdiv double
 147 ; SISLOWF64: estimated cost of 36 for {{.*}} fdiv double
 148 define amdgpu_kernel void @rcp_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 { ; writes (1.0 / *%vaddr) to *%out
 149   %den = load double, double addrspace(1)* %vaddr ; divisor is read from memory
 150   %rcp = fdiv double 1.0, %den ; constant 1.0 dividend, so this fdiv is a reciprocal
 151   store double %rcp, double addrspace(1)* %out
 152   ret void
 153 }
 154
 155 ; ALL: 'rcp_v2f32'
 156 ; NOFP32DENORM: estimated cost of 6 for {{.*}} fdiv <2 x float>
 157 ; SLOWFP32DENORMS: estimated cost of 20 for {{.*}} fdiv <2 x float>
 158 ; FASTFP32DENORMS: estimated cost of 20 for {{.*}} fdiv <2 x float>
 159 define amdgpu_kernel void @rcp_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 { ; two-lane f32 reciprocal
 160   %den = load <2 x float>, <2 x float> addrspace(1)* %vaddr ; divisor vector is read from memory
 161   %rcp = fdiv <2 x float> <float 1.0, float 1.0>, %den ; every dividend lane is the constant 1.0
 162   store <2 x float> %rcp, <2 x float> addrspace(1)* %out
 163   ret void
 164 }
 165
 166 ; ALL: 'rcp_v2f16'
 167 ; NOFP16-NOFP32DENORM: estimated cost of 6 for {{.*}} fdiv <2 x half>
 168 ; NOFP16-FP32DENORM: estimated cost of 20 for {{.*}} fdiv <2 x half>
 169 ; FP16: estimated cost of 6 for {{.*}} fdiv <2 x half>
 170 define amdgpu_kernel void @rcp_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 { ; two-lane f16 reciprocal
 171   %den = load <2 x half>, <2 x half> addrspace(1)* %vaddr ; divisor vector is read from memory
 172   %rcp = fdiv <2 x half> <half 1.0, half 1.0>, %den ; every dividend lane is the constant 1.0
 173   store <2 x half> %rcp, <2 x half> addrspace(1)* %out
 174   ret void
 175 }
 176
 177 attributes #0 = { nounwind } ; attribute group referenced as #0 by the kernel definitions in this file