clang/test/Driver/cuda-external-tools.cu

   1 // Tests that ptxas and fatbinary are invoked correctly during CUDA
   2 // compilation.
   3
   4 // Regular compiles with -O{0,1,2,3,4,fast}.  -O4 and -Ofast map to ptxas -O3.
   5 // RUN: %clang -### --target=x86_64-linux-gnu -O0 -c %s 2>&1 \
   6 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
   7 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT0 %s
   8 // RUN: %clang -### --target=x86_64-linux-gnu -O1 -c %s 2>&1 \
   9 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  10 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT1 %s
  11 // RUN: %clang -### --target=x86_64-linux-gnu -O2 -c %s 2>&1 \
  12 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  13 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT2 %s
  14 // RUN: %clang -### --target=x86_64-linux-gnu -O3 -c %s 2>&1 \
  15 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  16 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT3 %s
  17 // RUN: %clang -### --target=x86_64-linux-gnu -O4 -c %s 2>&1 \
  18 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  19 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT3 %s
  20 // RUN: %clang -### --target=x86_64-linux-gnu -Ofast -c %s 2>&1 \
  21 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  22 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT3 %s
  23 // Generating relocatable device code
  24 // RUN: %clang -### --target=x86_64-linux-gnu -fgpu-rdc -c %s 2>&1 \
  25 // RUN:   --no-offload-new-driver --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  26 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s
  27
  28 // With debugging enabled, ptxas should be run with no ptxas optimizations.
  29 // RUN: %clang -### --target=x86_64-linux-gnu --cuda-noopt-device-debug -O2 -g -c %s 2>&1 \
  30 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  31 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,DBG %s
  32
  33 // --no-cuda-noopt-device-debug overrides --cuda-noopt-device-debug.
  34 // RUN: %clang -### --target=x86_64-linux-gnu --cuda-noopt-device-debug \
  35 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  36 // RUN:   --no-cuda-noopt-device-debug -O2 -c %s 2>&1 \
  37 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT2 %s
  38
  39 // Regular compile without -O.  This should result in us passing -O0 to ptxas.
  40 // RUN: %clang -### --target=x86_64-linux-gnu -c %s 2>&1 \
  41 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  42 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT0 %s
  43
  44 // Regular compiles with -Os and -Oz.  For lack of a better option, we map
  45 // these to ptxas -O2.
  46 // RUN: %clang -### --target=x86_64-linux-gnu -Os -c %s 2>&1 \
  47 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  48 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT2 %s
  49 // RUN: %clang -### --target=x86_64-linux-gnu -Oz -c %s 2>&1 \
  50 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  51 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT2 %s
  52
  53 // Regular compile targeting sm_35.
  54 // RUN: %clang -### --target=x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \
  55 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  56 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35 %s
  57 // Separate compilation targeting sm_35.
  58 // RUN: %clang -### --target=x86_64-linux-gnu --cuda-gpu-arch=sm_35 -fgpu-rdc -c %s 2>&1 \
  59 // RUN:   --no-offload-new-driver --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  60 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s
  61
  62 // 32-bit compile.
  63 // RUN: %clang -### --target=i386-linux-gnu -c %s 2>&1 \
  64 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  65 // RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM35 %s
  66 // 32-bit compile when generating relocatable device code.
  67 // RUN: %clang -### --target=i386-linux-gnu -fgpu-rdc -c %s 2>&1 \
  68 // RUN:   --no-offload-new-driver --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  69 // RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM35,RDC %s
  70
  71 // Compile with -fintegrated-as.  This should still cause us to invoke ptxas.
  72 // RUN: %clang -### --target=x86_64-linux-gnu -fintegrated-as -c %s 2>&1 \
  73 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  74 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT0 %s
  75 // Check that we still pass -c when generating relocatable device code.
  76 // RUN: %clang -### --target=x86_64-linux-gnu -fintegrated-as -fgpu-rdc -c %s 2>&1 \
  77 // RUN:   --no-offload-new-driver --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  78 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s
  79
  80 // Check -Xcuda-ptxas and -Xcuda-fatbinary
  81 // RUN: %clang -### --target=x86_64-linux-gnu -c -Xcuda-ptxas -foo1 \
  82 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  83 // RUN:   -Xcuda-fatbinary -bar1 -Xcuda-ptxas -foo2 -Xcuda-fatbinary -bar2 %s 2>&1 \
  84 // RUN: | FileCheck -check-prefixes=CHECK,SM35,PTXAS-EXTRA,FATBINARY-EXTRA %s
  85
  86 // Check -Xcuda-ptxas with clang-cl
  87 // RUN: %clang_cl -### -c -Xcuda-ptxas -foo1 \
  88 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  89 // RUN:   -Xcuda-ptxas -foo2 -- %s 2>&1 \
  90 // RUN: | FileCheck -check-prefixes=CHECK,SM35,PTXAS-EXTRA %s
  91
  92 // MacOS spot-checks
  93 // RUN: %clang -### --target=x86_64-apple-macosx -O0 -c %s 2>&1 \
  94 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  95 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT0 %s
  96 // RUN: %clang -### --target=x86_64-apple-macosx --cuda-gpu-arch=sm_35 -c %s 2>&1 \
  97 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  98 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35 %s
  99 // RUN: %clang -### --target=i386-apple-macosx -c %s 2>&1 \
 100 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
 101 // RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM35 %s
 102
 103 // Check relocatable device code generation on MacOS.
 104 // RUN: %clang -### --target=x86_64-apple-macosx -O0 -fgpu-rdc -c %s 2>&1 \
 105 // RUN:   --no-offload-new-driver --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
 106 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s
 107 // RUN: %clang -### --target=x86_64-apple-macosx --cuda-gpu-arch=sm_35 -fgpu-rdc -c %s 2>&1 \
 108 // RUN:   --no-offload-new-driver --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
 109 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s
 110 // RUN: %clang -### --target=i386-apple-macosx -fgpu-rdc -c %s 2>&1 \
 111 // RUN:   --no-offload-new-driver --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
 112 // RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM35,RDC %s
 113
 114 // Check that CLANG forwards the -v flag to PTXAS.
 115 // RUN: %clang -### -save-temps -v %s 2>&1 \
 116 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
 117 // RUN: | FileCheck -check-prefix=CHK-PTXAS-VERBOSE %s
 118
 119 // Match clang job that produces PTX assembly.
 120 // CHECK: "-cc1"
 121 // ARCH64-SAME: "-triple" "nvptx64-nvidia-cuda"
 122 // ARCH32-SAME: "-triple" "nvptx-nvidia-cuda"
 123 // SM35-SAME: "-target-cpu" "sm_35"
 124 // RDC-SAME: "-fgpu-rdc"
 125 // CHECK-NOT: "-fgpu-rdc"
 126 // SM35-SAME: "-o" "[[PTXFILE:[^"]*]]"
 127
 128 // Match the call to ptxas (which assembles PTX to SASS).
 129 // CHECK: ptxas
 130 // ARCH64-SAME: "-m64"
 131 // ARCH32-SAME: "-m32"
 132 // OPT0-SAME: "-O0"
 133 // OPT0-NOT: "-g"
 134 // OPT1-SAME: "-O1"
 135 // OPT1-NOT: "-g"
 136 // OPT2-SAME: "-O2"
 137 // OPT2-NOT: "-g"
 138 // OPT3-SAME: "-O3"
 139 // OPT3-NOT: "-g"
 140 // DBG-SAME: "-g" "--dont-merge-basicblocks" "--return-at-end"
 141 // SM35-SAME: "--gpu-name" "sm_35"
 142 // SM35-SAME: "--output-file" "[[CUBINFILE:[^"]*]]"
 143 // CHECK-SAME: "[[PTXFILE]]"
 144 // PTXAS-EXTRA-SAME: "-foo1"
 145 // PTXAS-EXTRA-SAME: "-foo2"
 146 // CHECK-NOT: "-foo1"
 147 // CHECK-NOT: "-foo2"
 148 // RDC-SAME: "-c"
 149 // CHECK-NOT: "-c"
 150
 151 // Match the call to fatbinary (which combines all our PTX and SASS into one
 152 // blob).
 153 // CHECK: fatbinary
 154 // CHECK-SAME-DAG: "--cuda"
 155 // ARCH64-SAME-DAG: "-64"
 156 // ARCH32-SAME-DAG: "-32"
 157 // CHECK-DAG: "--create" "[[FATBINARY:[^"]*]]"
 158 // SM35-SAME-DAG: "--image=profile=compute_35,file=[[PTXFILE]]"
 159 // SM35-SAME-DAG: "--image=profile=sm_35,file=[[CUBINFILE]]"
 160 // FATBINARY-EXTRA-SAME: "-bar1"
 161 // FATBINARY-EXTRA-SAME: "-bar2"
 162
 163 // Match the clang job for host compilation.
 164 // CHECK: "-cc1"
 165 // ARCH64-SAME: "-triple" "x86_64-
 166 // ARCH32-SAME: "-triple" "i386-
 167 // CHECK-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]"
 168
 169 // CHK-PTXAS-VERBOSE: ptxas{{.*}}" "-v"