test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll

   1 ; XUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
   2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s
   3 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s
   4
   5
   6 ; Make sure (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c)) doesn't
   7 ; add an instruction if the fadd has more than one use.
   8
   9 declare half @llvm.fabs.f16(half) #1
  10 declare float @llvm.fabs.f32(float) #1
  11
  12 ; GCN-LABEL: {{^}}multiple_fadd_use_test_f32:
  13 ; SI: v_max_legacy_f32_e64 [[A16:v[0-9]+]],
  14 ; SI: v_add_f32_e32 [[A17:v[0-9]+]], [[A16]], [[A16]]
  15 ; SI: v_mul_f32_e32 [[A18:v[0-9]+]], [[A17]], [[A17]]
  16 ; SI: v_mad_f32 [[A20:v[0-9]+]], -[[A18]], [[A17]], 1.0
  17 ; SI: buffer_store_dword [[A20]]
  18
  19 ; VI: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -1.0
  20 ; VI: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -1.0
  21 ; VI: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, |v{{[0-9]+}}|
  22 ; VI: v_cndmask_b32_e32
  23 ; VI: v_add_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
  24 ; VI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
  25 ; VI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
  26 define amdgpu_kernel void @multiple_fadd_use_test_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 {
  27   %a11 = fadd fast float %y, -1.0
  28   %a12 = call float @llvm.fabs.f32(float %a11)
  29   %a13 = fadd fast float %x, -1.0
  30   %a14 = call float @llvm.fabs.f32(float %a13)
  31   %a15 = fcmp ogt float %a12, %a14
  32   %a16 = select i1 %a15, float %a12, float %a14
  33   %a17 = fmul fast float %a16, 2.0
  34   %a18 = fmul fast float %a17, %a17
  35   %a19 = fmul fast float %a18, %a17
  36   %a20 = fsub fast float 1.0, %a19
  37   store float %a20, float addrspace(1)* %out
  38   ret void
  39 }
  40
  41 ; GCN-LABEL: {{^}}multiple_use_fadd_fmac_f32:
  42 ; GCN-DAG: v_add_f32_e64 [[MUL2:v[0-9]+]], [[X:s[0-9]+]], s{{[0-9]+}}
  43 ; GCN-DAG: v_mac_f32_e64 [[MAD:v[0-9]+]], [[X]], 2.0
  44 ; GCN-DAG: buffer_store_dword [[MUL2]]
  45 ; GCN-DAG: buffer_store_dword [[MAD]]
  46 ; GCN: s_endpgm
  47 define amdgpu_kernel void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, float %y) #0 {
  48   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  49   %mul2 = fmul fast float %x, 2.0
  50   %mad = fadd fast float %mul2, %y
  51   store volatile float %mul2, float addrspace(1)* %out
  52   store volatile float %mad, float addrspace(1)* %out.gep.1
  53   ret void
  54 }
  55
  56 ; GCN-LABEL: {{^}}multiple_use_fadd_fmad_f32:
  57 ; GCN-DAG: v_add_f32_e64 [[MUL2:v[0-9]+]], |[[X:s[0-9]+]]|, |s{{[0-9]+}}|
  58 ; GCN-DAG: v_mad_f32 [[MAD:v[0-9]+]], |[[X]]|, 2.0, v{{[0-9]+}}
  59 ; GCN-DAG: buffer_store_dword [[MUL2]]
  60 ; GCN-DAG: buffer_store_dword [[MAD]]
  61 ; GCN: s_endpgm
  62 define amdgpu_kernel void @multiple_use_fadd_fmad_f32(float addrspace(1)* %out, float %x, float %y) #0 {
  63   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  64   %x.abs = call float @llvm.fabs.f32(float %x)
  65   %mul2 = fmul fast float %x.abs, 2.0
  66   %mad = fadd fast float %mul2, %y
  67   store volatile float %mul2, float addrspace(1)* %out
  68   store volatile float %mad, float addrspace(1)* %out.gep.1
  69   ret void
  70 }
  71
  72 ; GCN-LABEL: {{^}}multiple_use_fadd_multi_fmad_f32:
  73 ; GCN: v_mad_f32 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, v{{[0-9]+}}
  74 ; GCN: v_mad_f32 {{v[0-9]+}}, |[[X]]|, 2.0, v{{[0-9]+}}
  75 define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 {
  76   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  77   %x.abs = call float @llvm.fabs.f32(float %x)
  78   %mul2 = fmul fast float %x.abs, 2.0
  79   %mad0 = fadd fast float %mul2, %y
  80   %mad1 = fadd fast float %mul2, %z
  81   store volatile float %mad0, float addrspace(1)* %out
  82   store volatile float %mad1, float addrspace(1)* %out.gep.1
  83   ret void
  84 }
  85
  86 ; GCN-LABEL: {{^}}fmul_x2_xn2_f32:
  87 ; GCN: v_mul_f32_e64 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], -4.0
  88 ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]]
  89 ; GCN: buffer_store_dword [[RESULT]]
  90 define amdgpu_kernel void @fmul_x2_xn2_f32(float addrspace(1)* %out, float %x, float %y) #0 {
  91   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  92   %mul2 = fmul fast float %x, 2.0
  93   %muln2 = fmul fast float %x, -2.0
  94   %mul = fmul fast float %mul2, %muln2
  95   store volatile float %mul, float addrspace(1)* %out
  96   ret void
  97 }
  98
  99 ; GCN-LABEL: {{^}}fmul_x2_xn3_f32:
 100 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0xc0c00000
 101 ; GCN: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], [[K]]
 102 ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]]
 103 ; GCN: buffer_store_dword [[RESULT]]
 104 define amdgpu_kernel void @fmul_x2_xn3_f32(float addrspace(1)* %out, float %x, float %y) #0 {
 105   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
 106   %mul2 = fmul fast float %x, 2.0
 107   %muln2 = fmul fast float %x, -3.0
 108   %mul = fmul fast float %mul2, %muln2
 109   store volatile float %mul, float addrspace(1)* %out
 110   ret void
 111 }
 112
 113 ; GCN-LABEL: {{^}}multiple_fadd_use_test_f16:
 114 ; VI: v_add_f16_e64 v{{[0-9]+}}, s{{[0-9]+}}, -1.0
 115 ; VI: v_add_f16_e64 v{{[0-9]+}}, s{{[0-9]+}}, -1.0
 116 ; VI: v_cmp_gt_f16_e64 vcc, |v{{[0-9]+}}|, |v{{[0-9]+}}|
 117 ; VI: v_cndmask_b32_e32
 118 ; VI: v_add_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
 119 ; VI: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 120 ; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
 121 ; VI-DENORM: v_fma_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
 122 define amdgpu_kernel void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
 123   %x = bitcast i16 %x.arg to half
 124   %y = bitcast i16 %y.arg to half
 125   %z = bitcast i16 %z.arg to half
 126   %a11 = fadd fast half %y, -1.0
 127   %a12 = call half @llvm.fabs.f16(half %a11)
 128   %a13 = fadd fast half %x, -1.0
 129   %a14 = call half @llvm.fabs.f16(half %a13)
 130   %a15 = fcmp ogt half %a12, %a14
 131   %a16 = select i1 %a15, half %a12, half %a14
 132   %a17 = fmul fast half %a16, 2.0
 133   %a18 = fmul fast half %a17, %a17
 134   %a19 = fmul fast half %a18, %a17
 135   %a20 = fsub fast half 1.0, %a19
 136   store half %a20, half addrspace(1)* %out
 137   ret void
 138 }
 139
 140 ; GCN-LABEL: {{^}}multiple_use_fadd_fmac_f16:
 141 ; GCN-DAG: v_add_f16_e64 [[MUL2:v[0-9]+]], [[X:s[0-9]+]], s{{[0-9]+}}
 142
 143 ; VI-FLUSH-DAG: v_mac_f16_e64 [[MAD:v[0-9]+]], [[X]], 2.0
 144 ; VI-DENORM-DAG: v_fma_f16 [[MAD:v[0-9]+]], [[X]], 2.0, v{{[0-9]+}}
 145
 146 ; GCN-DAG: buffer_store_short [[MUL2]]
 147 ; GCN-DAG: buffer_store_short [[MAD]]
 148 ; GCN: s_endpgm
 149 define amdgpu_kernel void @multiple_use_fadd_fmac_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
 150   %x = bitcast i16 %x.arg to half
 151   %y = bitcast i16 %y.arg to half
 152   %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
 153   %mul2 = fmul fast half %x, 2.0
 154   %mad = fadd fast half %mul2, %y
 155   store volatile half %mul2, half addrspace(1)* %out
 156   store volatile half %mad, half addrspace(1)* %out.gep.1
 157   ret void
 158 }
 159
 160 ; GCN-LABEL: {{^}}multiple_use_fadd_fmad_f16:
 161 ; GCN-DAG: v_add_f16_e64 [[MUL2:v[0-9]+]], |[[X:s[0-9]+]]|, |s{{[0-9]+}}|
 162
 163 ; VI-FLUSH-DAG: v_mad_f16 [[MAD:v[0-9]+]], |[[X]]|, 2.0, v{{[0-9]+}}
 164 ; VI-DENORM-DAG: v_fma_f16 [[MAD:v[0-9]+]], |[[X]]|, 2.0, v{{[0-9]+}}
 165
 166 ; GCN-DAG: buffer_store_short [[MUL2]]
 167 ; GCN-DAG: buffer_store_short [[MAD]]
 168 ; GCN: s_endpgm
 169 define amdgpu_kernel void @multiple_use_fadd_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
 170   %x = bitcast i16 %x.arg to half
 171   %y = bitcast i16 %y.arg to half
 172   %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
 173   %x.abs = call half @llvm.fabs.f16(half %x)
 174   %mul2 = fmul fast half %x.abs, 2.0
 175   %mad = fadd fast half %mul2, %y
 176   store volatile half %mul2, half addrspace(1)* %out
 177   store volatile half %mad, half addrspace(1)* %out.gep.1
 178   ret void
 179 }
 180
 181 ; GCN-LABEL: {{^}}multiple_use_fadd_multi_fmad_f16:
 182 ; VI-FLUSH: v_mad_f16 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, v{{[0-9]+}}
 183 ; VI-FLUSH: v_mad_f16 {{v[0-9]+}}, |[[X]]|, 2.0, v{{[0-9]+}}
 184
 185 ; VI-DENORM: v_fma_f16 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, v{{[0-9]+}}
 186 ; VI-DENORM: v_fma_f16 {{v[0-9]+}}, |[[X]]|, 2.0, v{{[0-9]+}}
 187
 188 define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
 189   %x = bitcast i16 %x.arg to half
 190   %y = bitcast i16 %y.arg to half
 191   %z = bitcast i16 %z.arg to half
 192   %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
 193   %x.abs = call half @llvm.fabs.f16(half %x)
 194   %mul2 = fmul fast half %x.abs, 2.0
 195   %mad0 = fadd fast half %mul2, %y
 196   %mad1 = fadd fast half %mul2, %z
 197   store volatile half %mad0, half addrspace(1)* %out
 198   store volatile half %mad1, half addrspace(1)* %out.gep.1
 199   ret void
 200 }
 201
 202 ; GCN-LABEL: {{^}}fmul_x2_xn2_f16:
 203 ; GCN: v_mul_f16_e64 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], -4.0
 204 ; GCN: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]]
 205 ; GCN: buffer_store_short [[RESULT]]
 206 define amdgpu_kernel void @fmul_x2_xn2_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
 207   %x = bitcast i16 %x.arg to half
 208   %y = bitcast i16 %y.arg to half
 209   %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
 210   %mul2 = fmul fast half %x, 2.0
 211   %muln2 = fmul fast half %x, -2.0
 212   %mul = fmul fast half %mul2, %muln2
 213   store volatile half %mul, half addrspace(1)* %out
 214   ret void
 215 }
 216
 217 ; GCN-LABEL: {{^}}fmul_x2_xn3_f16:
 218 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0xc600
 219 ; GCN: v_mul_f16_e32 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], [[K]]
 220 ; GCN: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]]
 221 ; GCN: buffer_store_short [[RESULT]]
 222 define amdgpu_kernel void @fmul_x2_xn3_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
 223   %x = bitcast i16 %x.arg to half
 224   %y = bitcast i16 %y.arg to half
 225   %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
 226   %mul2 = fmul fast half %x, 2.0
 227   %muln2 = fmul fast half %x, -3.0
 228   %mul = fmul fast half %mul2, %muln2
 229   store volatile half %mul, half addrspace(1)* %out
 230   ret void
 231 }
 232
 233 attributes #0 = { nounwind "unsafe-fp-math"="true" }
 234 attributes #1 = { nounwind readnone }