nixos/slurm: add pmix to test and cleanup test

* use tmpfiles to create key for munge
* add mpitest source
* add a subtest for PMIx/MPI startup
This commit is contained in:
Markus Kowalewski 2020-07-02 14:27:43 +02:00
parent 93d8aa7da9
commit 61fceac1bb
No known key found for this signature in database
GPG Key ID: D865C8A91D7025EB
1 changed file with 59 additions and 34 deletions

View File

@ -1,16 +1,52 @@
import ./make-test-python.nix ({ lib, ... }: import ./make-test-python.nix ({ lib, pkgs, ... }:
let let
mungekey = "mungeverryweakkeybuteasytointegratoinatest";
slurmconfig = { slurmconfig = {
controlMachine = "control"; services.slurm = {
nodeName = [ "node[1-3] CPUs=1 State=UNKNOWN" ]; controlMachine = "control";
partitionName = [ "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP" ]; nodeName = [ "node[1-3] CPUs=1 State=UNKNOWN" ];
extraConfig = '' partitionName = [ "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP" ];
AccountingStorageHost=dbd extraConfig = ''
AccountingStorageType=accounting_storage/slurmdbd AccountingStorageHost=dbd
''; AccountingStorageType=accounting_storage/slurmdbd
'';
};
environment.systemPackages = [ mpitest ];
networking.firewall.enable = false;
systemd.tmpfiles.rules = [
"f /etc/munge/munge.key 0400 munge munge - mungeverryweakkeybuteasytointegratoinatest"
];
}; };
mpitest = let
mpitestC = pkgs.writeText "mpitest.c" ''
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
int
main (int argc, char *argv[])
{
int rank, size, length;
char name[512];
MPI_Init (&argc, &argv);
MPI_Comm_rank (MPI_COMM_WORLD, &rank);
MPI_Comm_size (MPI_COMM_WORLD, &size);
MPI_Get_processor_name (name, &length);
if ( rank == 0 ) printf("size=%d\n", size);
printf ("%s: hello world from process %d of %d\n", name, rank, size);
MPI_Finalize ();
return EXIT_SUCCESS;
}
'';
in pkgs.runCommandNoCC "mpitest" {} ''
mkdir -p $out/bin
${pkgs.openmpi}/bin/mpicc ${mpitestC} -o $out/bin/mpitest
'';
in { in {
name = "slurm"; name = "slurm";
@ -21,37 +57,40 @@ in {
computeNode = computeNode =
{ ...}: { ...}:
{ {
imports = [ slurmconfig ];
# TODO slurmd port and slurmctld port should be configurations and # TODO slurmd port and slurmctld port should be configurations and
# automatically allowed by the firewall. # automatically allowed by the firewall.
networking.firewall.enable = false;
services.slurm = { services.slurm = {
client.enable = true; client.enable = true;
} // slurmconfig; };
}; };
in { in {
control = control =
{ ...}: { ...}:
{ {
networking.firewall.enable = false; imports = [ slurmconfig ];
services.slurm = { services.slurm = {
server.enable = true; server.enable = true;
} // slurmconfig; };
}; };
submit = submit =
{ ...}: { ...}:
{ {
networking.firewall.enable = false; imports = [ slurmconfig ];
services.slurm = { services.slurm = {
enableStools = true; enableStools = true;
} // slurmconfig; };
}; };
dbd = dbd =
{ pkgs, ... } : { pkgs, ... } :
{ {
networking.firewall.enable = false; networking.firewall.enable = false;
systemd.tmpfiles.rules = [
"f /etc/munge/munge.key 0400 munge munge - mungeverryweakkeybuteasytointegratoinatest"
];
services.slurm.dbdserver = { services.slurm.dbdserver = {
enable = true; enable = true;
storagePass = "password123"; storagePass = "password123";
@ -87,24 +126,7 @@ in {
'' ''
start_all() start_all()
    # Set up authentication across the cluster # Make sure DBD is up after DB initialization
for node in [submit, control, dbd, node1, node2, node3]:
node.wait_for_unit("default.target")
node.succeed("mkdir /etc/munge")
node.succeed(
"echo '${mungekey}' > /etc/munge/munge.key"
)
node.succeed("chmod 0400 /etc/munge/munge.key")
node.succeed("chown munge:munge /etc/munge/munge.key")
node.succeed("systemctl restart munged")
node.wait_for_unit("munged")
# Restart the services since they have probably failed due to the munge init
# failure
with subtest("can_start_slurmdbd"): with subtest("can_start_slurmdbd"):
dbd.succeed("systemctl restart slurmdbd") dbd.succeed("systemctl restart slurmdbd")
dbd.wait_for_unit("slurmdbd.service") dbd.wait_for_unit("slurmdbd.service")
@ -137,5 +159,8 @@ in {
# find the srun job from above in the database # find the srun job from above in the database
control.succeed("sleep 5") control.succeed("sleep 5")
control.succeed("sacct | grep hostname") control.succeed("sacct | grep hostname")
with subtest("run_PMIx_mpitest"):
submit.succeed("srun -N 3 --mpi=pmix mpitest | grep size=3")
''; '';
}) })