assert lib.elem stdenv.system [
stdenv.mkDerivation (finalAttrs: {
  version = platformAttrs.${stdenv.system}.version or (throw "Unsupported system: ${stdenv.system}");
    "mirror://apache/hadoop/common/hadoop-${finalAttrs.version}/hadoop-${finalAttrs.version}"
    + lib.optionalString stdenv.hostPlatform.isAarch64 "-aarch64"
  inherit (platformAttrs.${stdenv.system} or (throw "Unsupported system: ${stdenv.system}"))
  # Build the container executor binary from source.
  # installPhase does not evaluate containerExecutor lazily for some reason,
  # so it has to be defined even on non-Linux platforms.
    if stdenv.hostPlatform.isLinux then
      (callPackage ./containerExecutor.nix {
        inherit (finalAttrs) version;
        inherit platformAttrs;
  ] ++ lib.optionals stdenv.hostPlatform.isLinux [ autoPatchelfHook ];
  buildInputs = lib.optionals stdenv.hostPlatform.isLinux [
    (lib.getLib stdenv.cc.cc)
  + lib.optionalString stdenv.hostPlatform.isLinux ''
    for n in $(find ${finalAttrs.containerExecutor}/bin -type f); do
    # these libraries are loaded at runtime by the JVM
    ln -s ${lib.getLib cyrus_sasl}/lib/libsasl2.so $out/lib/native/libsasl2.so.2
    ln -s ${lib.getLib openssl}/lib/libcrypto.so $out/lib/native/
    ln -s ${lib.getLib zlib}/lib/libz.so.1 $out/lib/native/
    ln -s ${lib.getLib zstd}/lib/libzstd.so.1 $out/lib/native/
    ln -s ${lib.getLib bzip2}/lib/libbz2.so.1 $out/lib/native/
    ln -s ${lib.getLib snappy}/lib/libsnappy.so.1 $out/lib/native/
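    # (informational: the JVM locates these through java.library.path;
    #  Hadoop's launch scripts typically add lib/native to it, and the
    #  wrapProgram call further down also prepends JAVA_LIBRARY_PATH)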
    # libjvm.so is in different paths for java 8 and 11
    # libnativetask.so in hadoop 3 and libhdfs.so in hadoop 2 depend on it
    find $out/lib/native/ -name 'libnativetask.so*' -o -name 'libhdfs.so*' | \
      xargs -n1 patchelf --add-rpath $(dirname $(find ${finalAttrs.jdk.home} -name libjvm.so | head -n1))
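    # Illustratively (the store path below is made up; the real one comes from
    # the find over the JDK above), each matched library ends up patched like:
    #   patchelf --add-rpath /nix/store/xxxx-openjdk-headless/lib/openjdk/lib/server $out/lib/native/libnativetask.so.1.0.0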
    # NixOS/nixpkgs#193370
    # This workaround is needed to use protobuf 3.19:
    # hadoop 3.3+ depends on protobuf 3.18, hadoop 3.2 depends on protobuf 3.8
    find $out/lib/native -name 'libhdfspp.so*' | \
      xargs -r -n1 patchelf --replace-needed libprotobuf.so.${
        if (lib.versionAtLeast finalAttrs.version "3.4.1") then
        else if (lib.versionAtLeast finalAttrs.version "3.3") then
    patchelf --replace-needed libcrypto.so.1.1 libcrypto.so \
      $out/lib/native/{libhdfs{pp,}.so*,examples/{pipes-sort,wordcount-nopipe,wordcount-part,wordcount-simple}}
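    # A possible manual check (illustrative, not executed by the build):
    #   patchelf --print-needed $out/lib/native/libhdfspp.so*
    # should now report libprotobuf.so and libcrypto.so rather than the
    # versioned sonames replaced above.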
    for n in $(find $out/bin -type f ! -name "*.*"); do
        --set-default JAVA_HOME ${finalAttrs.jdk.home} \
        --set-default HADOOP_HOME $out/ \
        --run "test -d /etc/hadoop-conf && export HADOOP_CONF_DIR=\''${HADOOP_CONF_DIR-'/etc/hadoop-conf/'}" \
        --set-default HADOOP_CONF_DIR $out/etc/hadoop/ \
        --prefix JAVA_LIBRARY_PATH : "${lib.makeLibraryPath finalAttrs.buildInputs}"
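      # Net effect of the flags above (sketch): an explicitly exported
      # HADOOP_CONF_DIR always wins; otherwise /etc/hadoop-conf is used when
      # that directory exists, and the bundled $out/etc/hadoop config is the
      # final fallback. For example:
      #   HADOOP_CONF_DIR=/my/conf hdfs dfs -ls /   # explicit value wins
      #   hdfs dfs -ls /                            # falls back as described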
  + (lib.optionalString sparkSupport ''
    # Add the spark shuffle service jar to YARN
    cp ${spark.src}/yarn/spark-${spark.version}-yarn-shuffle.jar $out/share/hadoop/yarn/
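    # To make YARN actually load the shuffle service, yarn-site.xml has to
    # enable it (a sketch of the standard Spark-on-YARN settings, not
    # something this derivation sets up):
    #   yarn.nodemanager.aux-services = mapreduce_shuffle,spark_shuffle
    #   yarn.nodemanager.aux-services.spark_shuffle.class = org.apache.spark.network.yarn.YarnShuffleService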
  passthru = { inherit tests; };
    homepage = "https://hadoop.apache.org/";
    description = "Framework for distributed processing of large data sets across clusters of computers";
    license = licenses.asl20;
    sourceProvenance = with sourceTypes; [ binaryBytecode ];
      The Apache Hadoop software library is a framework that allows for
      the distributed processing of large data sets across clusters of
      computers using a simple programming model. It is designed to
      scale up from single servers to thousands of machines, each
      offering local computation and storage. Rather than rely on
      hardware to deliver high availability, the library itself is
      designed to detect and handle failures at the application layer,
      delivering a highly available service on top of a cluster of
      computers, each of which may be prone to failures.
    maintainers = with maintainers; [ illustris ];
    platforms = attrNames platformAttrs;
  } (attrByPath [ stdenv.system "meta" ] { } platformAttrs);
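  # (informational: the merge above means a platformAttrs entry may carry its
  #  own meta overrides, e.g. the hypothetical
  #    platformAttrs.aarch64-darwin.meta.broken = true;)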
# Different versions of hadoop support different java runtime versions
# https://cwiki.apache.org/confluence/display/HADOOP/Hadoop+Java+Versions
hadoop_3_4 = common rec {
  platformAttrs = rec {
      hash = "sha256-mtVIeDOZbf5VFOdW9DkQKckFKf0i6NAC/T3QwUwEukY=";
      srcHash = "sha256-lE9uSohy6GWXprFEYbEin2ITqTms2h6EWXe4nEd3U4Y=";
    x86_64-darwin = x86_64-linux;
    aarch64-linux = x86_64-linux // {
      hash = "sha256-QWxzKtNyw/AzcHMv0v7kj91pw1HO7VAN9MHO84caFk8=";
      srcHash = "sha256-viDF3LdRCZHqFycOYfN7nUQBPHiMCIjmu7jgIAaaK9E=";
    aarch64-darwin = aarch64-linux;
  jdk = jdk11_headless;
  # TODO: Package and add Intel Storage Acceleration Library
  tests = nixosTests.hadoop;
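  # (the attached NixOS test can be run from a nixpkgs checkout with
  #  e.g. `nix-build -A nixosTests.hadoop`)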
hadoop_3_3 = common rec {
  platformAttrs = rec {
      hash = "sha256-9RlQWcDUECrap//xf3sqhd+Qa8tuGZSHFjGfmXhkGgQ=";
      srcHash = "sha256-4OEsVhBNV9CJ+PN4FgCduUCVA9/el5yezSCZ6ko3+bU=";
    x86_64-darwin = x86_64-linux;
    aarch64-linux = x86_64-linux // {
      hash = "sha256-5Lv2uA72BJEva5v2yncyPe5gKNCNOPNsoHffVt6KXQ0=";
    aarch64-darwin = aarch64-linux;
  jdk = jdk11_headless;
  # TODO: Package and add Intel Storage Acceleration Library
  tests = nixosTests.hadoop_3_3;
hadoop2 = common rec {
  platformAttrs.x86_64-linux = {
    hash = "sha256-xhA4zxqIRGNhIeBnJO9dLKf/gx/Bq+uIyyZwsIafEyo=";
    srcHash = "sha256-ucxCyXiJo8aL6aNMhZgKEbn8sGKOoMPVREbMGSfSdAI=";
  tests = nixosTests.hadoop2;