clang/test/CodeGenCUDA/fp-contract.cu

   1 // REQUIRES: x86-registered-target, nvptx-registered-target, amdgpu-registered-target
   2
   3 // By default CUDA uses -ffp-contract=fast, HIP uses -ffp-contract=fast-honor-pragmas.
   4 // we should fuse multiply/add into fma instruction.
   5 // In IR, fmul/fadd instructions with contract flag are emitted.
   6 // In backend
   7 //    nvptx -  assumes fast fp fuse option, which fuses
   8 //             mult/add insts disregarding contract flag and
   9 //             llvm.fmuladd intrinsics.
  10 //    amdgcn - assumes standard fp fuse option, which only
  11 //             fuses mult/add insts with contract flag and
  12 //             llvm.fmuladd intrinsics.
  13
  14 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
  15 // RUN:   -disable-llvm-passes -o - %s \
  16 // RUN:   | FileCheck -check-prefixes=COMMON,NV-ON %s
  17 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
  18 // RUN:   -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
  19 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-ON %s
  20 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
  21 // RUN:   -O3 -o - %s \
  22 // RUN:   | FileCheck -check-prefixes=COMMON,NV-OPT-FAST %s
  23 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
  24 // RUN:   -O3 -target-cpu gfx906 -o - -x hip %s \
  25 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s
  26
  27 // Check separate compile/backend steps corresponding to -save-temps.
  28
  29 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
  30 // RUN:   -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
  31 // RUN: cat %t.ll  | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST-IR %s
  32 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
  33 // RUN:   -O3 -target-cpu gfx906 -o - -x ir %t.ll \
  34 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s
  35
  36 // Explicit -ffp-contract=fast
  37 // In IR, fmul/fadd instructions with contract flag are emitted.
  38 // In backend
  39 //    nvptx/amdgcn - assumes fast fp fuse option, which fuses
  40 //                   mult/add insts disregarding contract flag and
  41 //                   llvm.fmuladd intrinsics.
  42
  43 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
  44 // RUN:   -ffp-contract=fast -disable-llvm-passes -o - %s \
  45 // RUN:   | FileCheck -check-prefixes=COMMON,NV-ON %s
  46 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
  47 // RUN:   -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
  48 // RUN:   -ffp-contract=fast \
  49 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-ON %s
  50 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
  51 // RUN:   -O3 -o - %s \
  52 // RUN:   -ffp-contract=fast \
  53 // RUN:   | FileCheck -check-prefixes=COMMON,NV-OPT-FAST %s
  54 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
  55 // RUN:   -O3 -target-cpu gfx906 -o - -x hip %s \
  56 // RUN:   -ffp-contract=fast \
  57 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST %s
  58
  59 // Check separate compile/backend steps corresponding to -save-temps.
  60 // When input is IR, -ffp-contract has no effect. Backend uses default
  61 // default FP fuse option.
  62
  63 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
  64 // RUN:   -ffp-contract=fast \
  65 // RUN:   -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
  66 // RUN: cat %t.ll  | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST-IR %s
  67 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
  68 // RUN:   -O3 -target-cpu gfx906 -o - -x ir %t.ll \
  69 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s
  70
  71 // Explicit -ffp-contract=fast-honor-pragmas
  72 // In IR, fmul/fadd instructions with contract flag are emitted.
  73 // In backend
  74 //    nvptx/amdgcn - assumes standard fp fuse option, which only
  75 //                   fuses mult/add insts with contract flag or
  76 //                   llvm.fmuladd intrinsics.
  77
  78 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
  79 // RUN:   -ffp-contract=fast-honor-pragmas -disable-llvm-passes -o - %s \
  80 // RUN:   | FileCheck -check-prefixes=COMMON,NV-ON %s
  81 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
  82 // RUN:   -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
  83 // RUN:   -ffp-contract=fast-honor-pragmas \
  84 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-ON %s
  85 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
  86 // RUN:   -O3 -o - %s \
  87 // RUN:   -ffp-contract=fast-honor-pragmas \
  88 // RUN:   | FileCheck -check-prefixes=COMMON,NV-OPT-FASTSTD %s
  89 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
  90 // RUN:   -O3 -target-cpu gfx906 -o - -x hip %s \
  91 // RUN:   -ffp-contract=fast-honor-pragmas \
  92 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s
  93
  94 // Check separate compile/backend steps corresponding to -save-temps.
  95 // When input is IR, -ffp-contract has no effect. Backend uses default
  96 // default FP fuse option.
  97
  98 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
  99 // RUN:   -ffp-contract=fast-honor-pragmas \
 100 // RUN:   -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
 101 // RUN: cat %t.ll  | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST-IR %s
 102 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
 103 // RUN:   -O3 -target-cpu gfx906 -o - -x ir %t.ll \
 104 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s
 105
 106 // Explicit -ffp-contract=on -- fusing by front-end.
 107 // In IR,
 108 //    mult/add in the same statement - llvm.fmuladd instrinsic emitted
 109 //    mult/add in different statement -  fmul/fadd instructions without
 110 //                                       contract flag are emitted.
 111 // In backend
 112 //    nvptx/amdgcn - assumes standard fp fuse option, which only
 113 //                   fuses mult/add insts with contract flag or
 114 //                   llvm.fmuladd intrinsics.
 115
 116 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
 117 // RUN:   -ffp-contract=on -disable-llvm-passes -o - %s \
 118 // RUN:   | FileCheck -check-prefixes=COMMON,NV-ON %s
 119 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
 120 // RUN:   -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
 121 // RUN:   -ffp-contract=on \
 122 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-ON %s
 123 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
 124 // RUN:   -O3 -o - %s \
 125 // RUN:   -ffp-contract=on \
 126 // RUN:   | FileCheck -check-prefixes=COMMON,NV-OPT-ON %s
 127 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
 128 // RUN:   -O3 -target-cpu gfx906 -o - -x hip %s \
 129 // RUN:   -ffp-contract=on \
 130 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-ON %s
 131
 132 // Check separate compile/backend steps corresponding to -save-temps.
 133
 134 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
 135 // RUN:   -ffp-contract=on \
 136 // RUN:   -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
 137 // RUN: cat %t.ll  | FileCheck -check-prefixes=COMMON,AMD-OPT-ON-IR %s
 138 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
 139 // RUN:   -O3 -target-cpu gfx906 -o - -x ir %t.ll \
 140 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-ON %s
 141
 142 // Explicit -ffp-contract=off should disable instruction fusing.
 143 // In IR, fmul/fadd instructions without contract flag are emitted.
 144 // In backend
 145 //    nvptx/amdgcn - assumes standard fp fuse option, which only
 146 //                   fuses mult/add insts with contract flag or
 147 //                   llvm.fmuladd intrinsics.
 148
 149 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
 150 // RUN:   -ffp-contract=off -disable-llvm-passes -o - %s \
 151 // RUN:   | FileCheck -check-prefixes=COMMON,NV-OFF %s
 152 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
 153 // RUN:   -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
 154 // RUN:   -ffp-contract=off \
 155 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OFF %s
 156 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
 157 // RUN:   -O3 -o - %s \
 158 // RUN:   -ffp-contract=off \
 159 // RUN:   | FileCheck -check-prefixes=COMMON,NV-OPT-OFF %s
 160 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
 161 // RUN:   -O3 -target-cpu gfx906 -o - -x hip %s \
 162 // RUN:   -ffp-contract=off \
 163 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-OFF %s
 164
 165 // Check separate compile/backend steps corresponding to -save-temps.
 166
 167 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
 168 // RUN:   -ffp-contract=off \
 169 // RUN:   -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
 170 // RUN: cat %t.ll  | FileCheck -check-prefixes=COMMON,AMD-OPT-OFF-IR %s
 171 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
 172 // RUN:   -O3 -target-cpu gfx906 -o - -x ir %t.ll \
 173 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-OFF %s
 174
 175 #include "Inputs/cuda.h"
 176
 177 // Test multiply/add in the same statement, which can be emitted as FMA when
 178 // fp-contract is on or fast.
 179 __host__ __device__ float func(float a, float b, float c) { return a + b * c; }
 180 // COMMON-LABEL: _Z4funcfff
 181 // NV-ON:       fma.rn.f32
 182 // NV-ON-NEXT:  st.param.f32
 183 // AMD-ON:       v_fmac_f32_e64
 184 // AMD-ON-NEXT:  s_setpc_b64
 185
 186 // NV-OFF:      mul.rn.f32
 187 // NV-OFF-NEXT: add.rn.f32
 188 // NV-OFF-NEXT: st.param.f32
 189 // AMD-OFF:      v_mul_f32_e64
 190 // AMD-OFF-NEXT: v_add_f32_e64
 191 // AMD-OFF-NEXT: s_setpc_b64
 192
 193 // NV-OPT-FAST: fma.rn.f32
 194 // NV-OPT-FAST-NEXT: st.param.f32
 195 // NV-OPT-FASTSTD: fma.rn.f32
 196 // NV-OPT-FASTSTD-NEXT: st.param.f32
 197 // NV-OPT-ON: fma.rn.f32
 198 // NV-OPT-ON-NEXT: st.param.f32
 199 // NV-OPT-OFF: mul.rn.f32
 200 // NV-OPT-OFF-NEXT: add.rn.f32
 201 // NV-OPT-OFF-NEXT: st.param.f32
 202
 203 // AMD-OPT-FAST-IR: fmul contract float
 204 // AMD-OPT-FAST-IR: fadd contract float
 205 // AMD-OPT-ON-IR: @llvm.fmuladd.f32
 206 // AMD-OPT-OFF-IR: fmul float
 207 // AMD-OPT-OFF-IR: fadd float
 208
 209 // AMD-OPT-FAST: v_fmac_f32_e32
 210 // AMD-OPT-FAST-NEXT: s_setpc_b64
 211 // AMD-OPT-FASTSTD: v_fmac_f32_e32
 212 // AMD-OPT-FASTSTD-NEXT: s_setpc_b64
 213 // AMD-OPT-ON: v_fmac_f32_e32
 214 // AMD-OPT-ON-NEXT: s_setpc_b64
 215 // AMD-OPT-OFF: v_mul_f32_e32
 216 // AMD-OPT-OFF-NEXT: v_add_f32_e32
 217 // AMD-OPT-OFF-NEXT: s_setpc_b64
 218
 219 // Test multiply/add in the different statements, which can be emitted as
 220 // FMA when fp-contract is fast but not on.
 221 __host__ __device__ float func2(float a, float b, float c) {
 222   float t = b * c;
 223   return t + a;
 224 }
 225 // COMMON-LABEL: _Z5func2fff
 226 // NV-OPT-FAST: fma.rn.f32
 227 // NV-OPT-FAST-NEXT: st.param.f32
 228 // NV-OPT-FASTSTD: fma.rn.f32
 229 // NV-OPT-FASTSTD-NEXT: st.param.f32
 230 // NV-OPT-ON: mul.rn.f32
 231 // NV-OPT-ON: add.rn.f32
 232 // NV-OPT-ON-NEXT: st.param.f32
 233 // NV-OPT-OFF: mul.rn.f32
 234 // NV-OPT-OFF: add.rn.f32
 235 // NV-OPT-OFF-NEXT: st.param.f32
 236
 237 // AMD-OPT-FAST-IR: fmul contract float
 238 // AMD-OPT-FAST-IR: fadd contract float
 239 // AMD-OPT-ON-IR: fmul float
 240 // AMD-OPT-ON-IR: fadd float
 241 // AMD-OPT-OFF-IR: fmul float
 242 // AMD-OPT-OFF-IR: fadd float
 243
 244 // AMD-OPT-FAST: v_fmac_f32_e32
 245 // AMD-OPT-FAST-NEXT: s_setpc_b64
 246 // AMD-OPT-FASTSTD: v_fmac_f32_e32
 247 // AMD-OPT-FASTSTD-NEXT: s_setpc_b64
 248 // AMD-OPT-ON: v_mul_f32_e32
 249 // AMD-OPT-ON-NEXT: v_add_f32_e32
 250 // AMD-OPT-ON-NEXT: s_setpc_b64
 251 // AMD-OPT-OFF: v_mul_f32_e32
 252 // AMD-OPT-OFF-NEXT: v_add_f32_e32
 253 // AMD-OPT-OFF-NEXT: s_setpc_b64
 254
 255 // Test multiply/add in the different statements, which is forced
 256 // to be compiled with fp contract on. fmul/fadd without contract
 257 // flags are emitted in IR. In nvptx, they are emitted as FMA in
 258 // fp-contract is fast but not on, as nvptx backend uses the same
 259 // fp fuse option as front end, whereas fast fp fuse option in
 260 // backend fuses fadd/fmul disregarding contract flag. In amdgcn
 261 // they are not fused as amdgcn always use standard fp fusion
 262 // option which respects contract flag.
 263   __host__ __device__ float func3(float a, float b, float c) {
 264 #pragma clang fp contract(on)
 265   float t = b * c;
 266   return t + a;
 267 }
 268 // COMMON-LABEL: _Z5func3fff
 269 // NV-OPT-FAST: fma.rn.f32
 270 // NV-OPT-FAST-NEXT: st.param.f32
 271 // NV-OPT-FASTSTD: mul.rn.f32
 272 // NV-OPT-FASTSTD: add.rn.f32
 273 // NV-OPT-FASTSTD-NEXT: st.param.f32
 274 // NV-OPT-ON: mul.rn.f32
 275 // NV-OPT-ON: add.rn.f32
 276 // NV-OPT-ON-NEXT: st.param.f32
 277 // NV-OPT-OFF: mul.rn.f32
 278 // NV-OPT-OFF: add.rn.f32
 279 // NV-OPT-OFF-NEXT: st.param.f32
 280
 281 // AMD-OPT-FAST-IR: fmul float
 282 // AMD-OPT-FAST-IR: fadd float
 283 // AMD-OPT-ON-IR: fmul float
 284 // AMD-OPT-ON-IR: fadd float
 285 // AMD-OPT-OFF-IR: fmul float
 286 // AMD-OPT-OFF-IR: fadd float
 287
 288 // AMD-OPT-FAST: v_fmac_f32_e32
 289 // AMD-OPT-FAST-NEXT: s_setpc_b64
 290 // AMD-OPT-FASTSTD: v_mul_f32_e32
 291 // AMD-OPT-FASTSTD-NEXT: v_add_f32_e32
 292 // AMD-OPT-FASTSTD-NEXT: s_setpc_b64
 293 // AMD-OPT-ON: v_mul_f32_e32
 294 // AMD-OPT-ON-NEXT: v_add_f32_e32
 295 // AMD-OPT-ON-NEXT: s_setpc_b64
 296 // AMD-OPT-OFF: v_mul_f32_e32
 297 // AMD-OPT-OFF-NEXT: v_add_f32_e32
 298 // AMD-OPT-OFF-NEXT: s_setpc_b64