pkgs/development/python-modules/torch/default.nix

   1 { stdenv, lib, fetchFromGitHub, fetchpatch, buildPythonPackage, python,
       # Feature switches (how each one is consumed further down in this file):
       #   cudaSupport   - turns on the CUDA asserts, environment and inputs;
       #                   cudaPackages supplies cudatoolkit, cudnn and nccl.
       #   cudaArchList  - explicit TORCH_CUDA_ARCH_LIST entries; when null, a list
       #                   is chosen from the cudatoolkit major version.
       #   mklDnnSupport - exported as USE_MKLDNN and USE_MKLDNN_CBLAS.
       #   useSystemNccl - exported as USE_SYSTEM_NCCL.
       #   MPISupport    - adds mpi to propagatedBuildInputs.
       #   buildDocs     - exported as BUILD_DOCS.
   2   cudaSupport ? false, cudaPackages, magma,
   3   mklDnnSupport ? true, useSystemNccl ? true,
   4   MPISupport ? false, mpi,
   5   buildDocs ? false,
   6   cudaArchList ? null,
   7
   8   # Native build inputs
   9   cmake, util-linux, linkFarm, symlinkJoin, which, pybind11, removeReferencesTo,
  10
  11   # Build inputs
  12   numactl,
  13   CoreServices, libobjc,
  14
  15   # Propagated build inputs
  16   numpy, pyyaml, cffi, click, typing-extensions,
  17
  18   # Unit tests
  19   hypothesis, psutil,
  20
  21   # virtual pkg that consistently instantiates blas across nixpkgs
  22   # See https://github.com/NixOS/nixpkgs/pull/83888
  23   blas,
  24
  25   # ninja (https://ninja-build.org) must be available to run the C++ extension tests.
  26   ninja,
  27
       # Added to buildInputs to avoid "flexible array member" build errors.
  28   linuxHeaders_5_19,
  29
  30   # dependencies for torch.utils.tensorboard
  31   pillow, six, future, tensorboard, protobuf,
  32
       # NOTE(review): isPy3k (and fetchpatch above) are not referenced in the
       # visible part of this file; confirm before dropping them.
  33   isPy3k, pythonOlder }:
  34
  35 let
  36   inherit (cudaPackages) cudatoolkit cudnn nccl;
  37 in
  38
  39 # CUDA builds are only accepted with a cudatoolkit whose major version is 9, 10 or 11.
  40 assert cudaSupport -> builtins.elem (lib.versions.major cudatoolkit.version)
  41   [ "9" "10" "11" ];
  42
  43 # Every CUDA-aware dependency has to be built against the very same cudatoolkit.
  44 assert (MPISupport && cudaSupport) -> mpi.cudatoolkit == cudatoolkit;
  45 assert cudaSupport -> magma.cudatoolkit == cudatoolkit;
  46
  47 let
       # Render a Nix boolean as the "1"/"0" string used for the build flags below.
  48   setBool = enabled: if enabled then "1" else "0";
       # One symlink tree holding cudatoolkit's `out` and `lib` outputs together with nccl.
  49   cudatoolkit_joined = symlinkJoin {
  51     # nccl is here purely for semantic grouping; it could be moved to nativeBuildInputs
  52     paths = [ cudatoolkit.out cudatoolkit.lib nccl.dev nccl.out ];
  50     name = "${cudatoolkit.name}-unsplit";
  53   };
  54
  55   # Give an explicit list of supported architectures for the build. See:
  56   # - pytorch bug report: https://github.com/pytorch/pytorch/issues/23573
  57   # - pytorch-1.2.0 build on nixpkgs: https://github.com/NixOS/nixpkgs/pull/65041
  58   #
  59   # This list was selected by omitting the TORCH_CUDA_ARCH_LIST parameter,
  60   # observing the fallback option (which selected all architectures known
  61   # from cudatoolkit_10_0, pytorch-1.2, and python-3.6), and doing a binary
  62   # search to find the offending architectures.
  63   #
  64   # NOTE: Because of sandboxing, this derivation can't auto-detect the hardware's
  65   # CUDA architecture, so new architectures are not supported until they are
  66   # explicitly added to this derivation.
  67   #
  68   # FIXME: CMake is throwing the following warning on pytorch-1.2:
  69   #
  70   # ```
  71   # CMake Warning at cmake/public/utils.cmake:172 (message):
  72   #   In the future we will require one to explicitly pass TORCH_CUDA_ARCH_LIST
  73   #   to cmake instead of implicitly setting it as an env variable.  This will
  74   #   become a FATAL_ERROR in future version of pytorch.
  75   # ```
  76   # If this is causing problems for your build, this derivation may have to strip
  77   # away the standard `buildPythonPackage` and use the
  78   # [*Adjust Build Options*](https://github.com/pytorch/pytorch/tree/v1.2.0#adjust-build-options-optional)
  79   # instructions. This will also add more flexibility around configurations
  80   # (allowing FBGEMM to be built in pytorch-1.1), and may future-proof this
  81   # derivation.
  82   brokenArchs = [ "3.0" ]; # documentation only: nothing reads this value.
  83
       # TORCH_CUDA_ARCH_LIST candidates per cudatoolkit major version; each newer
       # toolkit extends the list of the previous one.
  84   cudaCapabilities = rec {
         # Building for compute_75 on cuda 9 fails with "undefined architecture",
         # which suggests 7.0 is the last architecture cuda 9 can target.
         cuda9 = [ "3.5" "5.0" "5.2" "6.0" "6.1" "7.0" "7.0+PTX" ];

         # 7.5 was the most recent architecture as of cudatoolkit_10_0 and pytorch-1.2.0.
         cuda10 = cuda9 ++ [ "7.5" "7.5+PTX" ];

         # 8.0 came with CUDA toolkit 11.0, 8.6 with CUDA toolkit 11.1.
         cuda11 = cuda10 ++ [ "8.0" "8.0+PTX" "8.6" "8.6+PTX" ];
 107   };
 108   final_cudaArchList =
         # Only a CUDA build without an explicit cudaArchList falls back to the
         # capability table; in every other case cudaArchList is passed through
         # unchanged (null included).
 109     if cudaSupport && cudaArchList == null
 110     then cudaCapabilities."cuda${lib.versions.major cudatoolkit.version}"
 111     else cudaArchList;
 112
 113   # Normally libcuda.so.1 is provided at runtime by nvidia-x11 via
 114   # LD_LIBRARY_PATH=/run/opengl-driver/lib.  We only use the stub
 115   # libcuda.so from cudatoolkit for running tests, so that we don't have
 116   # to recompile pytorch on every update to nvidia-x11 or the kernel.
       #
       # cudaStub is a link farm whose single entry, libcuda.so.1, points at that stub.
 117   cudaStub = linkFarm "cuda-stub" [{
 118     name = "libcuda.so.1";
 119     path = "${cudatoolkit}/lib/stubs/libcuda.so";
 120   }];
       # Shell prefix that puts cudaStub in front of LD_LIBRARY_PATH for a single
       # command (note the trailing space); the empty string without cudaSupport.
 121   cudaStubEnv = lib.optionalString cudaSupport
 122     "LD_LIBRARY_PATH=${cudaStub}\${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH ";
 123
 124 in buildPythonPackage rec {
 125   pname = "torch";
 126   # Don't forget to update torch-bin to the same version.
 127   version = "1.12.1";
 128   format = "setuptools";
 129
       # Marked as disabled for Python versions older than 3.7.
 130   disabled = pythonOlder "3.7.0";
 131
       # $dev and $lib are filled from the files installed into $out (see postInstall).
 132   outputs = [
 133     "out" # output standard python package
 134     "dev" # output libtorch headers
 135     "lib" # output libtorch libraries
 136   ];
 137
       # Upstream sources at the release tag matching `version`, git submodules included.
 138   src = fetchFromGitHub {
 139     owner = "pytorch";
 140     repo = "pytorch";
 141     rev = "refs/tags/v${version}";
 142     fetchSubmodules = true;
 143     hash = "sha256-8378BVOBFCRYRG1+yIYFSPKmb1rFOLgR+8pNZKt9NfI=";
 144   };
 145
 146   patches = lib.optionals (stdenv.isDarwin && stdenv.isx86_64) [
 147     # pthreadpool added support for Grand Central Dispatch in April
 148     # 2020. That code relies on DISPATCH_APPLY_AUTO, which is only
 149     # available starting with macOS 10.13, while our current base is
 150     # 10.12. Until we upgrade, we fall back on the older pthread
 151     # support.
 152     ./pthreadpool-disable-gcd.diff
 153   ];
 154
 155   preConfigure = lib.optionalString cudaSupport (''
 156     export TORCH_CUDA_ARCH_LIST="${lib.concatStringsSep ";" final_cudaArchList}"
 157     export CC=${cudatoolkit.cc}/bin/gcc CXX=${cudatoolkit.cc}/bin/g++
 158   '' + lib.optionalString (cudnn != null) ''
 159     export CUDNN_INCLUDE_DIR=${cudnn}/include
 160   '');
 161
 162   # Use pytorch's custom configurations instead of the cmake hook's own configure step
 163   dontUseCmakeConfigure = true;
 164
       # setBool renders these switches as "1" / "0".
 165   BUILD_NAMEDTENSOR = setBool true;
 166   BUILD_DOCS = setBool buildDocs;
 167
 168   # We only do an imports check, so do not build tests either.
 169   BUILD_TEST = setBool false;
 170
 171   # Unlike MKL, oneDNN (née MKLDNN) is FOSS, so we enable support for
 172   # it by default. PyTorch currently uses its own vendored version
 173   # of oneDNN through Intel iDeep.
 174   USE_MKLDNN = setBool mklDnnSupport;
 175   USE_MKLDNN_CBLAS = setBool mklDnnSupport;
 176
 177   # Avoid using pybind11 from git submodule
 178   # Also avoids pytorch exporting the headers of pybind11
       # NOTE(review): the name is spelled BIND11, not PYBIND11 - confirm that this
       # is the option name the pinned PyTorch release actually reads.
 179   USE_SYSTEM_BIND11 = true;
 180
       # Export the job count from NIX_BUILD_CORES, run setup.py with --cmake-only,
       # then invoke cmake once more on the `build` directory.
 181   preBuild = ''
 182     export MAX_JOBS=$NIX_BUILD_CORES
 183     ${python.interpreter} setup.py build --cmake-only
 184     ${cmake}/bin/cmake build
 185   '';
 186
 187   preFixup = ''
         # join_by SEP ARG...: print ARG... joined by the single character SEP.
 188     function join_by { local IFS="$1"; shift; echo "$*"; }
         # strip2 FILE: replace the first two RPATH entries of FILE with $ORIGIN.
 189     function strip2 {
           local -a rp
           local rest
           # IFS is scoped to this one `read`, so the remainder of the fixup
           # phase keeps the default word splitting.
           IFS=: read -ra rp <<< "$(patchelf --print-rpath "$1")"
           rest=$(join_by : "''${rp[@]:2}")
           # Append ':' only when entries remain: an empty RPATH entry would
           # make the loader search the current directory.
           patchelf --set-rpath "\$ORIGIN''${rest:+:$rest}" "$1"
 195     }
         # NOTE(review): postInstall moves torch/lib into $lib and leaves a symlink
         # behind, which find does not follow - confirm this still matches files.
         while IFS= read -r -d "" f; do
           strip2 "$f"
         done < <(find "$out" -name 'libcaffe2*.so' -print0)
 200   '';
 201
 202   # Override the (weirdly) wrong version set by default. See
 203   # https://github.com/NixOS/nixpkgs/pull/52437#issuecomment-449718038
 204   # https://github.com/pytorch/pytorch/blob/v1.0.0/setup.py#L267
 205   PYTORCH_BUILD_VERSION = version;
 206   PYTORCH_BUILD_NUMBER = 0;
 207
 208   USE_SYSTEM_NCCL = setBool useSystemNccl; # "1": don't build pytorch's third_party NCCL
 209
 210   # Suppress a weird warning in mkl-dnn, part of ideep in pytorch
 211   # (upstream seems to have fixed this in the wrong place?)
 212   # https://github.com/intel/mkl-dnn/commit/8134d346cdb7fe1695a2aa55771071d455fae0bc
 213   # https://github.com/pytorch/pytorch/issues/22346
 214   #
 215   # Also of interest: pytorch ignores CXXFLAGS and uses CFLAGS for both C and C++:
 216   # https://github.com/pytorch/pytorch/blob/v1.11.0/setup.py#L17
       #
       # The flag is only added when the blas implementation is MKL.
 217   NIX_CFLAGS_COMPILE = lib.optionals (blas.implementation == "mkl") [ "-Wno-error=array-bounds" ];
 218
 219   nativeBuildInputs = [
 220     cmake
 221     util-linux
 222     which
 223     ninja
 224     pybind11
         # presumably the provider of the remove-references-to tool called in postInstall
 225     removeReferencesTo
         # CUDA builds additionally get the merged cudatoolkit + nccl tree.
 226   ] ++ lib.optionals cudaSupport [ cudatoolkit_joined ];
 227
 228   buildInputs = [ blas blas.provider pybind11 ]
         # The kernel headers are a Linux-only package; adding them unconditionally
         # breaks the Darwin builds that meta.platforms still advertises.
 229     ++ lib.optionals stdenv.isLinux [ linuxHeaders_5_19 ] # TMP: avoid "flexible array member" errors for now
 230     ++ lib.optionals cudaSupport [ cudnn magma nccl ]
 231     ++ lib.optionals stdenv.isLinux [ numactl ]
 232     ++ lib.optionals stdenv.isDarwin [ CoreServices libobjc ];
 233
 234   propagatedBuildInputs = [
 235     cffi
 236     click
 237     numpy
 238     pyyaml
 239     typing-extensions
 240     # the following are required for tensorboard support
 241     pillow six future tensorboard protobuf
         # mpi is only propagated when MPISupport is enabled
 242   ] ++ lib.optionals MPISupport [ mpi ];
 243
 244   # Tests take a long time and may be flaky, so just sanity-check imports
 245   doCheck = false;
 246
       # Modules whose import is verified in place of the test suite.
 247   pythonImportsCheck = [
 248     "torch"
 249   ];
 250
       # NOTE(review): presumably only relevant when doCheck is overridden to true.
 251   checkInputs = [ hypothesis ninja psutil ];
 252
 253   checkPhase =
         let
           # Test modules handed to `run_test.py --exclude`.
           excludedTests = [
 259         "utils" # utils requires git, which is not allowed in the check phase
 260
 261         # "dataloader" # psutil correctly finds and triggers multiprocessing, but is too sandboxed to run -- resulting in numerous errors
 262         # ^^^^^^^^^^^^ NOTE: while test_dataloader does return errors, these are acceptable errors and do not interfere with the build
 263
 264         # tensorboard has acceptable failures for pytorch 1.3.x due to dependencies on tensorboard-plugins
           ] ++ lib.optional (lib.versions.majorMinor version == "1.3") "tensorboard";
         in
         # The hooks and the test command each need their own line: joined into a
         # single line, everything after `runHook preCheck` would merely be passed
         # to that hook as arguments and the tests would never run.
         # cudaStubEnv is either empty or an `LD_LIBRARY_PATH=... ` prefix with a
         # trailing space, so it is placed directly in front of the interpreter.
         ''
           runHook preCheck
           ${cudaStubEnv}${python.interpreter} test/run_test.py --exclude ${lib.concatStringsSep " " excludedTests}
           runHook postCheck
 268   '';
 269
       # Populate the split outputs once the regular install is done:
       #  1. run remove-references-to against stdenv.cc on every file below
       #     torch/include and torch/lib;
       #  2. copy torch/include and torch/share into $dev and rewrite the library
       #     directory in the two CMake files to $lib/lib;
       #  3. move torch/lib into $lib and leave a symlink to it behind in $out.
 270   postInstall = ''
 271     find "$out/${python.sitePackages}/torch/include" "$out/${python.sitePackages}/torch/lib" -type f -exec remove-references-to -t ${stdenv.cc} '{}' +
 272
 273     mkdir $dev
 274     cp -r $out/${python.sitePackages}/torch/include $dev/include
 275     cp -r $out/${python.sitePackages}/torch/share $dev/share
 276
 277     # Fix up library paths for split outputs
 278     substituteInPlace \
 279       $dev/share/cmake/Torch/TorchConfig.cmake \
 280       --replace \''${TORCH_INSTALL_PREFIX}/lib "$lib/lib"
 281
 282     substituteInPlace \
 283       $dev/share/cmake/Caffe2/Caffe2Targets-release.cmake \
 284       --replace \''${_IMPORT_PREFIX}/lib "$lib/lib"
 285
 286     mkdir $lib
 287     mv $out/${python.sitePackages}/torch/lib $lib/lib
 288     ln -s $lib/lib $out/${python.sitePackages}/torch/lib
 289   '';
 290
 291   postFixup = lib.optionalString stdenv.isDarwin ''
         # Give every dylib in $lib/lib its absolute path as install name. The glob
         # already yields full paths, so no `ls` parsing or basename is needed.
 292     for f in "$lib"/lib/*.dylib; do
 293         install_name_tool -id "$f" "$f" || true
 294     done
 295
         # Point the @rpath references between the torch libraries at $lib/lib.
 296     install_name_tool -change @rpath/libshm.dylib $lib/lib/libshm.dylib $lib/lib/libtorch_python.dylib
 297     install_name_tool -change @rpath/libtorch.dylib $lib/lib/libtorch.dylib $lib/lib/libtorch_python.dylib
 298     install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libtorch_python.dylib
 299
 300     install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libtorch.dylib
 301
 302     install_name_tool -change @rpath/libtorch.dylib $lib/lib/libtorch.dylib $lib/lib/libshm.dylib
 303     install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libshm.dylib
 304   '';
 305
 306   # Builds in 2+h with 2 cores, and ~15m with a big-parallel builder.
 307   requiredSystemFeatures = [ "big-parallel" ];
 308
       # Extra attributes exposed on the resulting package.
 309   passthru = {
 310     inherit cudaSupport cudaPackages;
         # The architecture list that was selected for the build; null when
         # cudaSupport is off and no cudaArchList was passed in.
 311     cudaArchList = final_cudaArchList;
 312     # At least for 1.10.2 `torch.fft` is unavailable unless BLAS provider is MKL. This attribute allows for easy detection of its availability.
 313     blasProvider = blas.provider;
 314   };
 315
 316   meta = {
 317     changelog = "https://github.com/pytorch/pytorch/releases/tag/v${version}";
 318     # "PyTorch" stays in the description so that a search for that name on search.nixos.org finds this package
 319     description = "PyTorch: Tensors and Dynamic neural networks in Python with strong GPU acceleration";
 320     homepage = "https://pytorch.org/";
 321     license = lib.licenses.bsd3;
 322     maintainers = with lib.maintainers; [ teh thoughtpolice tscholak ]; # tscholak esp. for darwin-related builds
         # Darwin is only listed for builds without CUDA.
 323     platforms = lib.platforms.linux ++ lib.optionals (!cudaSupport) lib.platforms.darwin;
 324   };
 325 }