pkgs/development/python-modules/pytorch/default.nix

   1 { stdenv, lib, fetchFromGitHub, fetchpatch, buildPythonPackage, python,
   2   cudaSupport ? false, cudatoolkit, cudnn, nccl, magma,  # CUDA toggle + packages only consumed when it is true
   3   mklDnnSupport ? true, useSystemNccl ? true,  # exported as USE_MKLDNN / USE_MKLDNN_CBLAS and USE_SYSTEM_NCCL
   4   MPISupport ? false, mpi,  # when true, mpi is added to propagatedBuildInputs
   5   buildDocs ? false,  # exported as BUILD_DOCS
   6   cudaArchList ? null,  # null selects a default list from the cudatoolkit major version
   7
   8   # Native build inputs
   9   cmake, util-linux, linkFarm, symlinkJoin, which,
  10
  11   # Build inputs
  12   numactl,
  13
  14   # Propagated build inputs
  15   dataclasses, numpy, pyyaml, cffi, click, typing-extensions,
  16
  17   # Unit tests
  18   hypothesis, psutil,
  19
  20   # virtual pkg that consistently instantiates blas across nixpkgs
  21   # See https://github.com/NixOS/nixpkgs/pull/83888
  22   blas,
  23
  24   # ninja (https://ninja-build.org) must be available to run C++ extensions tests
  25   ninja,
  26
  27   # dependencies for torch.utils.tensorboard
  28   pillow, six, future, tensorflow-tensorboard, protobuf,
  29
  30   isPy3k, pythonOlder }:
  31
  32 # A CUDA build is only accepted with a cudatoolkit whose major version is 9, 10 or 11.
  33 assert cudaSupport ->
  34   lib.elem (lib.versions.major cudatoolkit.version) [ "9" "10" "11" ];
  35
  36 # mpi and magma must reference the same cudatoolkit as this derivation.
  37 assert (MPISupport && cudaSupport) -> mpi.cudatoolkit == cudatoolkit;
  38 assert cudaSupport -> magma.cudatoolkit == cudatoolkit;
  39
  40 let
  41   setBool = v: if v then "1" else "0";  # Nix bool -> "1" / "0" string for the env flags below
  42   cudatoolkit_joined = symlinkJoin {  # one tree combining the cudatoolkit and nccl outputs listed in paths
  43     name = "${cudatoolkit.name}-unsplit";
  44     # nccl is here purely for semantic grouping; it could be moved to nativeBuildInputs
  45     paths = [ cudatoolkit.out cudatoolkit.lib nccl.dev nccl.out ];
  46   };
  47
  48   # Give an explicit list of supported architectures for the build. See:
  49   # - pytorch bug report: https://github.com/pytorch/pytorch/issues/23573
  50   # - pytorch-1.2.0 build on nixpkgs: https://github.com/NixOS/nixpkgs/pull/65041
  51   #
  52   # This list was selected by omitting the TORCH_CUDA_ARCH_LIST parameter,
  53   # observing the fallback option (which selected all architectures known
  54   # from cudatoolkit_10_0, pytorch-1.2, and python-3.6), and doing a binary
  55   # search to find offending architectures.
  56   #
  57   # NOTE: Because of sandboxing, this derivation can't auto-detect the hardware's
  58   # cuda architecture, so there is also now a problem around new architectures
  59   # not being supported until explicitly added to this derivation.
  60   #
  61   # FIXME: CMake is throwing the following warning on pytorch-1.2:
  62   #
  63   # ```
  64   # CMake Warning at cmake/public/utils.cmake:172 (message):
  65   #   In the future we will require one to explicitly pass TORCH_CUDA_ARCH_LIST
  66   #   to cmake instead of implicitly setting it as an env variable.  This will
  67   #   become a FATAL_ERROR in future version of pytorch.
  68   # ```
  69   # If this is causing problems for your build, this derivation may have to strip
  70   # away the standard `buildPythonPackage` and use the
  71   # [*Adjust Build Options*](https://github.com/pytorch/pytorch/tree/v1.2.0#adjust-build-options-optional)
  72   # instructions. This will also add more flexibility around configurations
  73   # (allowing FBGEMM to be built in pytorch-1.1), and may future proof this
  74   # derivation.
  75   brokenArchs = [ "3.0" ]; # documentation only: nothing else in this file reads this variable.
  76
  77   cudaCapabilities = rec {  # default arch lists keyed by "cuda<major>"; each list extends the previous one
  78     cuda9 = [
  79       "3.5"
  80       "5.0"
  81       "5.2"
  82       "6.0"
  83       "6.1"
  84       "7.0"
  85       "7.0+PTX"  # Adding 7.5 gives an "undefined architecture compute_75" error on cuda 9,
  86                  # which suggests that 7.0 is the final cuda-9-compatible architecture.
  87     ];
  88
  89     cuda10 = cuda9 ++ [
  90       "7.5"
  91       "7.5+PTX"  # < most recent architecture as of cudatoolkit_10_0 and pytorch-1.2.0
  92     ];
  93
  94     cuda11 = cuda10 ++ [
  95       "8.0"
  96       "8.0+PTX"  # < CUDA toolkit 11.0
  97       "8.6"
  98       "8.6+PTX"  # < CUDA toolkit 11.1
  99     ];
 100   };
 101   final_cudaArchList =  # an explicit cudaArchList wins; CUDA builds without one use the toolkit's default list
 102     if cudaSupport && cudaArchList == null
 103     then cudaCapabilities.${"cuda" + lib.versions.major cudatoolkit.version}
 104     else cudaArchList;
 105
 106   # Normally libcuda.so.1 is provided at runtime by nvidia-x11 via
 107   # LD_LIBRARY_PATH=/run/opengl-driver/lib.  We only use the stub
 108   # libcuda.so from cudatoolkit for running tests, so that we don’t have
 109   # to recompile pytorch on every update to nvidia-x11 or the kernel.
 110   cudaStub = linkFarm "cuda-stub" [{  # exposes the toolkit's stub under the libcuda.so.1 name
 111     name = "libcuda.so.1";
 112     path = "${cudatoolkit}/lib/stubs/libcuda.so";
 113   }];
 114   cudaStubEnv = lib.optionalString cudaSupport  # env-assignment prefix for the test command; empty without CUDA
 115     "LD_LIBRARY_PATH=${cudaStub}\${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH ";
 116
 117 in buildPythonPackage rec {
 118   pname = "pytorch";
 119   # Don't forget to update pytorch-bin to the same version.
 120   version = "1.8.1";
 121
 122   disabled = !isPy3k;  # package is disabled unless isPy3k holds
 123
 124   outputs = [
 125     "out"   # output standard python package
 126     "dev"   # output libtorch headers (populated in postInstall)
 127     "lib"   # output libtorch libraries (populated in postInstall)
 128   ];
 129
 130   src = fetchFromGitHub {
 131     owner  = "pytorch";
 132     repo   = "pytorch";
 133     rev    = "v${version}";  # upstream release tag derived from version
 134     fetchSubmodules = true;
 135     sha256 = "sha256-HERbvmrfhWwH164GFHU/M0KbhVAuhI5sBZSxCZy8mRk=";
 136   };
 137
 138   patches = lib.optionals stdenv.isDarwin [
 139     # pthreadpool added support for Grand Central Dispatch in April
 140     # 2020. This relies on functionality (DISPATCH_APPLY_AUTO) that is
 141     # only available starting with macOS 10.13, whereas our current
 142     # base is 10.12. Until we upgrade, we can fall back on the older
 143     # pthread support.
 144     ./pthreadpool-disable-gcd.diff
 145   ];
 146
 147   # The dataclasses module is included with Python >= 3.7, so restrict the setup.py
 148   # requirement to python_version < "3.7". This should be fixed with the next PyTorch release.
 149   postPatch = ''
 150     substituteInPlace setup.py \
 151       --replace "'dataclasses'" "'dataclasses; python_version < \"3.7\"'"
 152   '';
 153
 154   preConfigure = lib.optionalString cudaSupport ''
 155     export TORCH_CUDA_ARCH_LIST="${lib.strings.concatStringsSep ";" final_cudaArchList}"
 156     export CC=${cudatoolkit.cc}/bin/gcc CXX=${cudatoolkit.cc}/bin/g++
 157   '' + lib.optionalString (cudaSupport && cudnn != null) ''
 158     export CUDNN_INCLUDE_DIR=${cudnn}/include
 159   '';  # both snippets are CUDA-only; the second additionally requires a non-null cudnn
 160
 161   # Use pytorch's custom configurations: cmake is driven through setup.py in preBuild
 162   dontUseCmakeConfigure = true;
 163
 164   BUILD_NAMEDTENSOR = setBool true;
 165   BUILD_DOCS = setBool buildDocs;
 166
 167   # We only do an imports check (see pythonImportsCheck), so do not build tests either.
 168   BUILD_TEST = setBool false;
 169
 170   # Unlike MKL, oneDNN (née MKLDNN) is FOSS, so we enable support for
 171   # it by default. PyTorch currently uses its own vendored version
 172   # of oneDNN through Intel iDeep.
 173   USE_MKLDNN = setBool mklDnnSupport;
 174   USE_MKLDNN_CBLAS = setBool mklDnnSupport;
 175
 176   preBuild = ''
 177     export MAX_JOBS=$NIX_BUILD_CORES
 178     ${python.interpreter} setup.py build --cmake-only
 179     ${cmake}/bin/cmake build
 180   '';  # sets MAX_JOBS to $NIX_BUILD_CORES, runs setup.py with --cmake-only, then runs cmake on ./build
 181
 182   preFixup = ''
 183     function join_by { local IFS="$1"; shift; echo "$*"; }
 184     function strip2 {
 185       # Drop the first two RPATH entries of "$1" and put $ORIGIN in front instead.
 186       local rp rp_new  # locals + per-command IFS: nothing leaks into later hooks
 187       IFS=':' read -ra rp <<< "$(patchelf --print-rpath "$1")"
 188       rp_new=$(join_by : "''${rp[@]:2}")
 189       patchelf --set-rpath "\$ORIGIN:$rp_new" "$1"
 190     }
 191     for f in $(find "$out" -name 'libcaffe2*.so')
 192     do
 193       strip2 "$f"
 194     done
 195   '';  # rewrites the RPATH of every libcaffe2*.so found under $out
 196
 197   # Override the (weirdly) wrong version set by default. See
 198   # https://github.com/NixOS/nixpkgs/pull/52437#issuecomment-449718038
 199   # https://github.com/pytorch/pytorch/blob/v1.0.0/setup.py#L267
 200   PYTORCH_BUILD_VERSION = version;
 201   PYTORCH_BUILD_NUMBER = 0;
 202
 203   USE_SYSTEM_NCCL=setBool useSystemNccl;                  # "1" means: don't build pytorch's third_party NCCL
 204
 205   # Suppress a weird warning in mkl-dnn, part of ideep in pytorch
 206   # (upstream seems to have fixed this in the wrong place?)
 207   # https://github.com/intel/mkl-dnn/commit/8134d346cdb7fe1695a2aa55771071d455fae0bc
 208   # https://github.com/pytorch/pytorch/issues/22346
 209   #
 210   # Also of interest: pytorch ignores CXXFLAGS and uses CFLAGS for both C and C++:
 211   # https://github.com/pytorch/pytorch/blob/v1.2.0/setup.py#L17
 212   NIX_CFLAGS_COMPILE = lib.optionals (blas.implementation == "mkl") [ "-Wno-error=array-bounds" ];
 213
 214   nativeBuildInputs = [
 215     cmake
 216     util-linux
 217     which
 218     ninja
 219   ] ++ lib.optionals cudaSupport [ cudatoolkit_joined ];  # merged cudatoolkit + nccl tree from the let block
 220
 221   buildInputs = [ blas blas.provider ]
 222     ++ lib.optionals cudaSupport [ cudnn magma nccl ]
 223     ++ lib.optionals stdenv.isLinux [ numactl ];  # numactl is only added on Linux
 224
 225   propagatedBuildInputs = [
 226     cffi
 227     click
 228     numpy
 229     pyyaml
 230     typing-extensions
 231     # the following are required for tensorboard support
 232     pillow six future tensorflow-tensorboard protobuf
 233   ] ++ lib.optionals MPISupport [ mpi ]
 234     ++ lib.optionals (pythonOlder "3.7") [ dataclasses ];  # only added when pythonOlder "3.7" holds
 235
 236   checkInputs = [ hypothesis ninja psutil ];
 237
 238   # Tests take a long time and may be flaky, so just sanity-check imports
 239   doCheck = false;
 240   pythonImportsCheck = [
 241     "torch"
 242   ];
 243
 244   checkPhase = with lib.versions; with lib.strings; concatStringsSep " " [  # NOTE(review): presumably not run while doCheck = false — confirm
 245     cudaStubEnv
 246     "${python.interpreter} test/run_test.py"
 247     "--exclude"
 248     (concatStringsSep " " [
 249       "utils" # utils requires git, which is not allowed in the check phase
 250
 251       # "dataloader" # psutils correctly finds and triggers multiprocessing, but is too sandboxed to run -- resulting in numerous errors
 252       # ^^^^^^^^^^^^ NOTE: while test_dataloader does return errors, these are acceptable errors and do not interfere with the build
 253
 254       # tensorboard has acceptable failures for pytorch 1.3.x due to dependencies on tensorboard-plugins
 255       (optionalString (majorMinor version == "1.3" ) "tensorboard")
 256     ])
 257   ];
 258   postInstall = ''
 259     mkdir $dev
 260     cp -r $out/${python.sitePackages}/torch/include $dev/include
 261     cp -r $out/${python.sitePackages}/torch/share   $dev/share
 262
 263     # Fix up library paths for split outputs
 264     substituteInPlace \
 265       $dev/share/cmake/Torch/TorchConfig.cmake \
 266       --replace \''${TORCH_INSTALL_PREFIX}/lib "$lib/lib"
 267
 268     substituteInPlace \
 269       $dev/share/cmake/Caffe2/Caffe2Targets-release.cmake \
 270       --replace \''${_IMPORT_PREFIX}/lib "$lib/lib"
 271
 272     mkdir $lib
 273     cp -r $out/${python.sitePackages}/torch/lib     $lib/lib
 274   '';  # copies torch/include and torch/share into $dev and torch/lib into $lib/lib; the originals stay in $out
 275
 276   postFixup = lib.optionalString stdenv.isDarwin ''
 277     for f in $(ls $lib/lib/*.dylib); do
 278         install_name_tool -id $lib/lib/$(basename $f) $f || true
 279     done
 280
 281     install_name_tool -change @rpath/libshm.dylib $lib/lib/libshm.dylib $lib/lib/libtorch_python.dylib
 282     install_name_tool -change @rpath/libtorch.dylib $lib/lib/libtorch.dylib $lib/lib/libtorch_python.dylib
 283     install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libtorch_python.dylib
 284
 285     install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libtorch.dylib
 286
 287     install_name_tool -change @rpath/libtorch.dylib $lib/lib/libtorch.dylib $lib/lib/libcaffe2_observers.dylib
 288     install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libcaffe2_observers.dylib
 289
 290     install_name_tool -change @rpath/libtorch.dylib $lib/lib/libtorch.dylib $lib/lib/libcaffe2_module_test_dynamic.dylib
 291     install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libcaffe2_module_test_dynamic.dylib
 292
 293     install_name_tool -change @rpath/libtorch.dylib $lib/lib/libtorch.dylib $lib/lib/libcaffe2_detectron_ops.dylib
 294     install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libcaffe2_detectron_ops.dylib
 295
 296     install_name_tool -change @rpath/libtorch.dylib $lib/lib/libtorch.dylib $lib/lib/libshm.dylib
 297     install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libshm.dylib
 298   '';  # Darwin only: points the dylibs' install names and their @rpath references at absolute $lib/lib paths
 299
 300   meta = with lib; {
 301     description = "Open source, prototype-to-production deep learning platform";
 302     homepage    = "https://pytorch.org/";
 303     license     = licenses.bsd3;
 304     platforms   = with platforms; linux ++ lib.optionals (!cudaSupport) darwin;  # darwin is only listed for non-CUDA builds
 305     maintainers = with maintainers; [ danieldk teh thoughtpolice tscholak ]; # tscholak esp. for darwin-related builds
 306     # Darwin build fails with: error: use of undeclared identifier 'noU'; did you mean 'no'?
 307     broken = stdenv.isDarwin;
 308   };
 309 }