pkgs/applications/networking/cluster/hadoop/default.nix

   1 { lib
   2 , stdenv
   3 , fetchurl
   4 , makeWrapper
   5 , autoPatchelfHook
   6 , jdk8_headless
   7 , jdk11_headless
   8 , bash
   9 , coreutils
  10 , which
  11 , bzip2
  12 , cyrus_sasl
  13 , protobuf
  14 , snappy
  15 , zlib
  16 , zstd
  17 , openssl
  18 , nixosTests
  19 , sparkSupport ? true
  20 , spark
  21 , libtirpc
  22 , callPackage
  23 }:
  24
  25 assert lib.elem stdenv.system [ "x86_64-linux" "x86_64-darwin" "aarch64-linux" "aarch64-darwin" ];
  26
  27 let
  # Shared builder for every Hadoop release defined in this file.
  #
  # Arguments:
  #   pname         - package name of the resulting derivation.
  #   platformAttrs - attrset keyed by system string. The entry for the current
  #                   system supplies `version` and `hash` (for the fetched
  #                   tarball) and may carry a `meta` attrset that is merged
  #                   over the default meta below. The whole set is also
  #                   forwarded to ./containerExecutor.nix.
  #   jdk           - JDK whose home becomes the default JAVA_HOME of the
  #                   wrapped programs and whose libjvm.so directory is added
  #                   to the rpath of the native libraries.
  #   tests         - exposed as `passthru.tests`.
  #
  # Evaluation throws "Unsupported system: ..." when platformAttrs has no
  # entry for stdenv.system.
  common = { pname, platformAttrs, jdk, tests }:
    stdenv.mkDerivation (finalAttrs: {
      inherit pname jdk;
      version = platformAttrs.${stdenv.system}.version or (throw "Unsupported system: ${stdenv.system}");
      src = fetchurl {
        # aarch64 systems fetch the tarball with the "-aarch64" suffix.
        url = "mirror://apache/hadoop/common/hadoop-${finalAttrs.version}/hadoop-${finalAttrs.version}"
              + lib.optionalString stdenv.hostPlatform.isAarch64 "-aarch64" + ".tar.gz";
        inherit (platformAttrs.${stdenv.system} or (throw "Unsupported system: ${stdenv.system}")) hash;
      };
      doCheck = true;

      # Build the container executor binary from source
      # InstallPhase is not lazily evaluating containerExecutor for some reason
      containerExecutor = if stdenv.hostPlatform.isLinux then (callPackage ./containerExecutor.nix {
        inherit (finalAttrs) version;
        inherit platformAttrs;
      }) else "";

      nativeBuildInputs = [ makeWrapper ]
                          ++ lib.optionals stdenv.hostPlatform.isLinux [ autoPatchelfHook ];
      buildInputs = lib.optionals stdenv.hostPlatform.isLinux [ (lib.getLib stdenv.cc.cc) openssl protobuf zlib snappy libtirpc ];

      # The phase is assembled from four fragments: unpacked-tree install
      # (always), native library fix-ups (Linux only), program wrapping
      # (always) and the Spark shuffle jar (when sparkSupport is enabled).
      # runHook calls let overrides attach preInstall/postInstall hooks.
      installPhase = ''
        runHook preInstall

        mkdir $out
        mv * $out/
      '' + lib.optionalString stdenv.hostPlatform.isLinux ''
        for n in $(find ${finalAttrs.containerExecutor}/bin -type f); do
          ln -sf "$n" $out/bin
        done

        # these libraries are loaded at runtime by the JVM
        ln -s ${lib.getLib cyrus_sasl}/lib/libsasl2.so $out/lib/native/libsasl2.so.2
        ln -s ${lib.getLib openssl}/lib/libcrypto.so $out/lib/native/
        ln -s ${lib.getLib zlib}/lib/libz.so.1 $out/lib/native/
        ln -s ${lib.getLib zstd}/lib/libzstd.so.1 $out/lib/native/
        ln -s ${lib.getLib bzip2}/lib/libbz2.so.1 $out/lib/native/
        ln -s ${lib.getLib snappy}/lib/libsnappy.so.1 $out/lib/native/

        # libjvm.so is in different paths for java 8 and 11
        # libnativetask.so in hadoop 3 and libhdfs.so in hadoop 2 depend on it
        # -r: do not invoke patchelf at all when find matches nothing
        find $out/lib/native/ -name 'libnativetask.so*' -o -name 'libhdfs.so*' | \
          xargs -r -n1 patchelf --add-rpath $(dirname $(find ${finalAttrs.jdk.home} -name libjvm.so | head -n1))

        # NixOS/nixpkgs#193370
        # This workaround is needed to use protobuf 3.19
        # hadoop 3.3+ depends on protobuf 3.18, 3.2 depends on 3.8
        find $out/lib/native -name 'libhdfspp.so*' | \
          xargs -r -n1 patchelf --replace-needed libprotobuf.so.${
            if (lib.versionAtLeast finalAttrs.version "3.3") then "18"
            else "8"
          } libprotobuf.so

        patchelf --replace-needed libcrypto.so.1.1 libcrypto.so \
          $out/lib/native/{libhdfs{pp,}.so*,examples/{pipes-sort,wordcount-nopipe,wordcount-part,wordcount-simple}}

      '' + ''
        # Wrap every extensionless file in bin; files with a dot in the name are skipped.
        for n in $(find $out/bin -type f ! -name "*.*"); do
          wrapProgram "$n"\
            --set-default JAVA_HOME ${finalAttrs.jdk.home}\
            --set-default HADOOP_HOME $out/\
            --run "test -d /etc/hadoop-conf && export HADOOP_CONF_DIR=\''${HADOOP_CONF_DIR-'/etc/hadoop-conf/'}"\
            --set-default HADOOP_CONF_DIR $out/etc/hadoop/\
            --prefix PATH : "${lib.makeBinPath [ bash coreutils which]}"\
            --prefix JAVA_LIBRARY_PATH : "${lib.makeLibraryPath finalAttrs.buildInputs}"
        done
      '' + (lib.optionalString sparkSupport ''
        # Add the spark shuffle service jar to YARN
        cp ${spark.src}/yarn/spark-${spark.version}-yarn-shuffle.jar $out/share/hadoop/yarn/
      '') + ''
        runHook postInstall
      '';

      passthru = { inherit tests; };

      # Default meta, overlaid with the optional per-system `meta` from platformAttrs.
      meta = with lib; recursiveUpdate {
        homepage = "https://hadoop.apache.org/";
        description = "Framework for distributed processing of large data sets across clusters of computers";
        license = licenses.asl20;
        sourceProvenance = with sourceTypes; [ binaryBytecode ];

        longDescription = ''
          The Apache Hadoop software library is a framework that allows for
          the distributed processing of large data sets across clusters of
          computers using a simple programming model. It is designed to
          scale up from single servers to thousands of machines, each
          offering local computation and storage. Rather than rely on
          hardware to deliver high-availability, the library itself is
          designed to detect and handle failures at the application layer,
          so delivering a highly-available service on top of a cluster of
          computers, each of which may be prone to failures.
        '';
        maintainers = with maintainers; [ illustris ];
        platforms = attrNames platformAttrs;
      } (attrByPath [ stdenv.system "meta" ] {} platformAttrs);
    });
 121 in
 122 {
  # Different versions of Hadoop support different Java runtime versions
 124   # https://cwiki.apache.org/confluence/display/HADOOP/Hadoop+Java+Versions
 125   hadoop_3_4 = common rec {
 126     pname = "hadoop";
 127     platformAttrs = rec {
 128       x86_64-linux = {
 129         version = "3.4.0";
 130         hash = "sha256-4xGnhIBBQDD57GNUml1oXmnibyBxA9mr8hpIud0DyGw=";
 131         srcHash = "sha256-viDF3LdRCZHqFycOYfN7nUQBPHiMCIjmu7jgIAaaK9E=";
 132       };
 133       x86_64-darwin = x86_64-linux;
 134       aarch64-linux = x86_64-linux // {
 135         hash = "sha256-QWxzKtNyw/AzcHMv0v7kj91pw1HO7VAN9MHO84caFk8=";
 136       };
 137       aarch64-darwin = aarch64-linux;
 138     };
 139     jdk = jdk11_headless;
 140     # TODO: Package and add Intel Storage Acceleration Library
 141     tests = nixosTests.hadoop;
 142   };
 143   hadoop_3_3 = common rec {
 144     pname = "hadoop";
 145     platformAttrs = rec {
 146       x86_64-linux = {
 147         version = "3.3.6";
 148         hash = "sha256-9RlQWcDUECrap//xf3sqhd+Qa8tuGZSHFjGfmXhkGgQ=";
 149         srcHash = "sha256-4OEsVhBNV9CJ+PN4FgCduUCVA9/el5yezSCZ6ko3+bU=";
 150       };
 151       x86_64-darwin = x86_64-linux;
 152       aarch64-linux = x86_64-linux // {
 153         hash = "sha256-5Lv2uA72BJEva5v2yncyPe5gKNCNOPNsoHffVt6KXQ0=";
 154       };
 155       aarch64-darwin = aarch64-linux;
 156     };
 157     jdk = jdk11_headless;
 158     # TODO: Package and add Intel Storage Acceleration Library
 159     tests = nixosTests.hadoop_3_3;
 160   };
 161   hadoop2 = common rec {
 162     pname = "hadoop";
 163     platformAttrs.x86_64-linux = {
 164       version = "2.10.2";
 165       hash = "sha256-xhA4zxqIRGNhIeBnJO9dLKf/gx/Bq+uIyyZwsIafEyo=";
 166       srcHash = "sha256-ucxCyXiJo8aL6aNMhZgKEbn8sGKOoMPVREbMGSfSdAI=";
 167     };
 168     jdk = jdk8_headless;
 169     tests = nixosTests.hadoop2;
 170   };
 171 }