import ./make-test-python.nix ({ lib, pkgs, ... }:
        controlMachine = "control";
        nodeName = [ "node[1-3] CPUs=1 State=UNKNOWN" ];
        partitionName = [ "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP" ];
          AccountingStorageHost=dbd
          AccountingStorageType=accounting_storage/slurmdbd
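          # The two Accounting* options above route job accounting records to
          # the slurmdbd daemon running on the "dbd" node.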
      environment.systemPackages = [ mpitest ];
      networking.firewall.enable = false;
      systemd.tmpfiles.rules = [
16 "f /etc/munge/munge.key 0400 munge munge - mungeverryweakkeybuteasytointegratoinatest"
      mpitestC = pkgs.writeText "mpitest.c" ''
        main (int argc, char *argv[])
          int rank, size, length;

          MPI_Init (&argc, &argv);
          MPI_Comm_rank (MPI_COMM_WORLD, &rank);
          MPI_Comm_size (MPI_COMM_WORLD, &size);
          MPI_Get_processor_name (name, &length);
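          /* Rank 0 prints the world size so the test script can grep for "size=3". */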
          if ( rank == 0 ) printf("size=%d\n", size);

          printf ("%s: hello world from process %d of %d\n", name, rank, size);
    in pkgs.runCommand "mpitest" {} ''
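      # mpicc lives in the dev output of pkgs.mpi, so lib.getDev selects it.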
      ${lib.getDev pkgs.mpi}/bin/mpicc ${mpitestC} -o $out/bin/mpitest
  meta.maintainers = [ lib.maintainers.markuskowa ];
        imports = [ slurmconfig ];
        # TODO: the slurmd and slurmctld ports should be configuration options
        # and automatically opened in the firewall.
        imports = [ slurmconfig ];
        imports = [ slurmconfig ];
        passFile = pkgs.writeText "dbdpassword" "password123";
        networking.firewall.enable = false;
        systemd.tmpfiles.rules = [
          "f /etc/munge/munge.key 0400 munge munge - mungeverryweakkeybuteasytointegratoinatest"
        services.slurm.dbdserver = {
          storagePassFile = "${passFile}";
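          # The file's contents must match the password of the 'slurm' MySQL
          # user created in initialScript below.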
          package = pkgs.mariadb;
          initialScript = pkgs.writeText "mysql-init.sql" ''
            CREATE USER 'slurm'@'localhost' IDENTIFIED BY 'password123';
            GRANT ALL PRIVILEGES ON slurm_acct_db.* TO 'slurm'@'localhost';
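            -- slurmdbd creates its tables in slurm_acct_db on first start,
            -- so it needs full privileges on the whole database.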
          ensureDatabases = [ "slurm_acct_db" ];
            ensurePermissions = { "slurm_acct_db.*" = "ALL PRIVILEGES"; };
            # recommendations from: https://slurm.schedmd.com/accounting.html#mysql-configuration
            innodb_buffer_pool_size = "1024M";
            innodb_log_file_size = "64M";
            innodb_lock_wait_timeout = 900;
  # Make sure slurmdbd is up after the database initialization.
  with subtest("can_start_slurmdbd"):
      dbd.succeed("systemctl restart slurmdbd")
      dbd.wait_for_unit("slurmdbd.service")
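      # 6819 is slurmdbd's default port (SlurmdbdPort).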
      dbd.wait_for_open_port(6819)
  # An entry for the current cluster must exist in the
  # accounting database before slurmctld is restarted.
  with subtest("add_account"):
      control.succeed("sacctmgr -i add cluster default")
      # check that the cluster entry was created
      control.succeed("sacctmgr list cluster | awk '{ print $1 }' | grep default")
  with subtest("can_start_slurmctld"):
      control.succeed("systemctl restart slurmctld")
      control.wait_for_unit("slurmctld.service")
  with subtest("can_start_slurmd"):
      for node in [node1, node2, node3]:
          node.succeed("systemctl restart slurmd.service")
          node.wait_for_unit("slurmd")
  # Test that the cluster works and can distribute jobs.
  with subtest("run_distributed_command"):
      # Run `hostname` on 3 nodes of the partition (i.e. on all 3 nodes).
      # The output must contain 3 different hostnames: count the unique
      # lines and fail unless there are exactly 3.
      submit.succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq")
  with subtest("check_slurm_dbd"):
      # Find the srun job from above in the accounting database;
      # sleep briefly to give slurmdbd time to commit the record.
      control.succeed("sleep 5")
      control.succeed("sacct | grep hostname")
  with subtest("run_PMIx_mpitest"):
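      # mpitest's rank 0 prints "size=<ranks>", so seeing "size=3" confirms
      # that all three ranks joined the PMIx-launched MPI job.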
      submit.succeed("srun -N 3 --mpi=pmix mpitest | grep size=3")