clang/test/Driver/cuda-external-tools.cu

   1 // Tests that ptxas and fatbinary are invoked correctly during CUDA
   2 // compilation.
   3 //
   4 // REQUIRES: x86-registered-target
   5 // REQUIRES: nvptx-registered-target
   6
   7 // Regular compiles with -O{0,1,2,3,4,fast}.  -O4 and -Ofast map to ptxas -O3.
   8 // RUN: %clang -### --target=x86_64-linux-gnu -O0 -c %s 2>&1 \
   9 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  10 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT0 %s
  11 // RUN: %clang -### --target=x86_64-linux-gnu -O1 -c %s 2>&1 \
  12 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  13 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT1 %s
  14 // RUN: %clang -### --target=x86_64-linux-gnu -O2 -c %s 2>&1 \
  15 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  16 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT2 %s
  17 // RUN: %clang -### --target=x86_64-linux-gnu -O3 -c %s 2>&1 \
  18 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  19 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT3 %s
  20 // RUN: %clang -### --target=x86_64-linux-gnu -O4 -c %s 2>&1 \
  21 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  22 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT3 %s
  23 // RUN: %clang -### --target=x86_64-linux-gnu -Ofast -c %s 2>&1 \
  24 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  25 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT3 %s
  26 // Generating relocatable device code
  27 // RUN: %clang -### --target=x86_64-linux-gnu -fgpu-rdc -c %s 2>&1 \
  28 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  29 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s
  30
  31 // With debugging enabled, ptxas should be run with no ptxas optimizations.
  32 // RUN: %clang -### --target=x86_64-linux-gnu --cuda-noopt-device-debug -O2 -g -c %s 2>&1 \
  33 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  34 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,DBG %s
  35
  36 // --no-cuda-noopt-device-debug overrides --cuda-noopt-device-debug.
  37 // RUN: %clang -### --target=x86_64-linux-gnu --cuda-noopt-device-debug \
  38 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  39 // RUN:   --no-cuda-noopt-device-debug -O2 -c %s 2>&1 \
  40 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT2 %s
  41
  42 // Regular compile without -O.  This should result in us passing -O0 to ptxas.
  43 // RUN: %clang -### --target=x86_64-linux-gnu -c %s 2>&1 \
  44 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  45 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT0 %s
  46
  47 // Regular compiles with -Os and -Oz.  For lack of a better option, we map
  48 // these to ptxas -O2.
  49 // RUN: %clang -### --target=x86_64-linux-gnu -Os -c %s 2>&1 \
  50 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  51 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT2 %s
  52 // RUN: %clang -### --target=x86_64-linux-gnu -Oz -c %s 2>&1 \
  53 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  54 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT2 %s
  55
  56 // Regular compile targeting sm_35.
  57 // RUN: %clang -### --target=x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \
  58 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  59 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35 %s
  60 // Separate compilation targeting sm_35.
  61 // RUN: %clang -### --target=x86_64-linux-gnu --cuda-gpu-arch=sm_35 -fgpu-rdc -c %s 2>&1 \
  62 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  63 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s
  64
  65 // 32-bit compile.
  66 // RUN: %clang -### --target=i386-linux-gnu -c %s 2>&1 \
  67 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  68 // RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM35 %s
  69 // 32-bit compile when generating relocatable device code.
  70 // RUN: %clang -### --target=i386-linux-gnu -fgpu-rdc -c %s 2>&1 \
  71 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  72 // RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM35,RDC %s
  73
  74 // Compile with -fintegrated-as.  This should still cause us to invoke ptxas.
  75 // RUN: %clang -### --target=x86_64-linux-gnu -fintegrated-as -c %s 2>&1 \
  76 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  77 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT0 %s
  78 // Check that we still pass -c when generating relocatable device code.
  79 // RUN: %clang -### --target=x86_64-linux-gnu -fintegrated-as -fgpu-rdc -c %s 2>&1 \
  80 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  81 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s
  82
  83 // Check -Xcuda-ptxas and -Xcuda-fatbinary
  84 // RUN: %clang -### --target=x86_64-linux-gnu -c -Xcuda-ptxas -foo1 \
  85 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  86 // RUN:   -Xcuda-fatbinary -bar1 -Xcuda-ptxas -foo2 -Xcuda-fatbinary -bar2 %s 2>&1 \
  87 // RUN: | FileCheck -check-prefixes=CHECK,SM35,PTXAS-EXTRA,FATBINARY-EXTRA %s
  88
  89 // MacOS spot-checks
  90 // RUN: %clang -### --target=x86_64-apple-macosx -O0 -c %s 2>&1 \
  91 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  92 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT0 %s
  93 // RUN: %clang -### --target=x86_64-apple-macosx --cuda-gpu-arch=sm_35 -c %s 2>&1 \
  94 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  95 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35 %s
  96 // RUN: %clang -### --target=i386-apple-macosx -c %s 2>&1 \
  97 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
  98 // RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM35 %s
  99
 100 // Check relocatable device code generation on MacOS.
 101 // RUN: %clang -### --target=x86_64-apple-macosx -O0 -fgpu-rdc -c %s 2>&1 \
 102 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
 103 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s
 104 // RUN: %clang -### --target=x86_64-apple-macosx --cuda-gpu-arch=sm_35 -fgpu-rdc -c %s 2>&1 \
 105 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
 106 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s
 107 // RUN: %clang -### --target=i386-apple-macosx -fgpu-rdc -c %s 2>&1 \
 108 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
 109 // RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM35,RDC %s
 110
 111 // Check that CLANG forwards the -v flag to PTXAS.
 112 // RUN: %clang -### -save-temps -v %s 2>&1 \
 113 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
 114 // RUN: | FileCheck -check-prefix=CHK-PTXAS-VERBOSE %s
 115
 116 // Match clang job that produces PTX assembly.
 117 // CHECK: "-cc1"
 118 // ARCH64-SAME: "-triple" "nvptx64-nvidia-cuda"
 119 // ARCH32-SAME: "-triple" "nvptx-nvidia-cuda"
 120 // SM35-SAME: "-target-cpu" "sm_35"
 121 // RDC-SAME: "-fgpu-rdc"
 122 // CHECK-NOT: "-fgpu-rdc"
 123 // SM35-SAME: "-o" "[[PTXFILE:[^"]*]]"
 124
 125 // Match the call to ptxas (which assembles PTX to SASS).
 126 // CHECK: ptxas
 127 // ARCH64-SAME: "-m64"
 128 // ARCH32-SAME: "-m32"
 129 // OPT0-SAME: "-O0"
 130 // OPT0-NOT: "-g"
 131 // OPT1-SAME: "-O1"
 132 // OPT1-NOT: "-g"
 133 // OPT2-SAME: "-O2"
 134 // OPT2-NOT: "-g"
 135 // OPT3-SAME: "-O3"
 136 // OPT3-NOT: "-g"
 137 // DBG-SAME: "-g" "--dont-merge-basicblocks" "--return-at-end"
 138 // SM35-SAME: "--gpu-name" "sm_35"
 139 // SM35-SAME: "--output-file" "[[CUBINFILE:[^"]*]]"
 140 // CHECK-SAME: "[[PTXFILE]]"
 141 // PTXAS-EXTRA-SAME: "-foo1"
 142 // PTXAS-EXTRA-SAME: "-foo2"
 143 // RDC-SAME: "-c"
 144 // CHECK-NOT: "-c"
 145
 146 // Match the call to fatbinary (which combines all our PTX and SASS into one
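 147 // blob).  NOTE(review): the combined -SAME-DAG suffix used below is not a documented FileCheck directive; confirm those patterns are actually matched.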
 148 // CHECK: fatbinary
 149 // CHECK-SAME-DAG: "--cuda"
 150 // ARCH64-SAME-DAG: "-64"
 151 // ARCH32-SAME-DAG: "-32"
 152 // CHECK-DAG: "--create" "[[FATBINARY:[^"]*]]"
 153 // SM35-SAME-DAG: "--image=profile=compute_35,file=[[PTXFILE]]"
 154 // SM35-SAME-DAG: "--image=profile=sm_35,file=[[CUBINFILE]]"
 155 // FATBINARY-EXTRA-SAME: "-bar1"
 156 // FATBINARY-EXTRA-SAME: "-bar2"
 157
 158 // Match the clang job for host compilation.
 159 // CHECK: "-cc1"
 160 // ARCH64-SAME: "-triple" "x86_64-
 161 // ARCH32-SAME: "-triple" "i386-
 162 // CHECK-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]"
 163
 164 // CHK-PTXAS-VERBOSE: ptxas{{.*}}" "-v"