Merge pull request #11870 from lancelotsix/improve_slurm_service

Improve slurm service configuration

commit 7e14e28a80
--- a/nixos/modules/services/computing/slurm/slurm.nix
+++ b/nixos/modules/services/computing/slurm/slurm.nix
@@ -34,6 +34,15 @@ in
     };
 
+    package = mkOption {
+      type = types.package;
+      default = pkgs.slurm-llnl;
+      example = literalExample "pkgs.slurm-llnl-full";
+      description = ''
+        The package to use for slurm binaries.
+      '';
+    };
+
     controlMachine = mkOption {
       type = types.nullOr types.str;
       default = null;
 
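With the new `package` option, a cluster profile can point the whole module at a different slurm build. A minimal sketch of a host configuration using it (the option values here are illustrative, not part of the change):

```nix
{ pkgs, ... }:

{
  services.slurm = {
    client.enable = true;
    # Swap in the full variant; the default remains pkgs.slurm-llnl.
    package = pkgs.slurm-llnl-full;
  };
}
```

The next hunk, in the same slurm.nix module, builds the wrappers that make this package choice effective.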
@@ -91,38 +100,69 @@ in
 
   ###### implementation
 
-  config = mkIf (cfg.client.enable || cfg.server.enable) {
+  config =
+    let
+      wrappedSlurm = pkgs.stdenv.mkDerivation {
+        name = "wrappedSlurm";
 
-    environment.systemPackages = [ pkgs.slurm-llnl ];
+        propagatedBuildInputs = [ cfg.package configFile ];
 
+        builder = pkgs.writeText "builder.sh" ''
+          source $stdenv/setup
+          mkdir -p $out/bin
+          find ${cfg.package}/bin -type f -executable | while read EXE
+          do
+            exename="$(basename $EXE)"
+            wrappername="$out/bin/$exename"
+            cat > "$wrappername" <<EOT
+          #!/bin/sh
+          if [ -z "\$SLURM_CONF" ]
+          then
+            SLURM_CONF="${configFile}" "$EXE" "\$@"
+          else
+            "$EXE" "\$@"
+          fi
+          EOT
+            chmod +x "$wrappername"
+          done
+        '';
+      };
+
+  in mkIf (cfg.client.enable || cfg.server.enable) {
+
+    environment.systemPackages = [ wrappedSlurm ];
 
     systemd.services.slurmd = mkIf (cfg.client.enable) {
-      path = with pkgs; [ slurm-llnl coreutils ];
+      path = with pkgs; [ wrappedSlurm coreutils ];
 
       wantedBy = [ "multi-user.target" ];
       after = [ "systemd-tmpfiles-clean.service" ];
 
       serviceConfig = {
         Type = "forking";
-        ExecStart = "${pkgs.slurm-llnl}/bin/slurmd -f ${configFile}";
+        ExecStart = "${wrappedSlurm}/bin/slurmd";
         PIDFile = "/run/slurmd.pid";
         ExecReload = "${pkgs.coreutils}/bin/kill -HUP $MAINPID";
       };
+
+      preStart = ''
+        mkdir -p /var/spool
+      '';
     };
 
     systemd.services.slurmctld = mkIf (cfg.server.enable) {
-      path = with pkgs; [ slurm-llnl munge coreutils ];
+      path = with pkgs; [ wrappedSlurm munge coreutils ];
 
       wantedBy = [ "multi-user.target" ];
-      after = [ "network.target" "auditd.service" "munged.service" "slurmdbd.service" ];
+      after = [ "network.target" "munged.service" ];
       requires = [ "munged.service" ];
 
       serviceConfig = {
         Type = "forking";
-        ExecStart = "${pkgs.slurm-llnl}/bin/slurmctld";
+        ExecStart = "${wrappedSlurm}/bin/slurmctld";
         PIDFile = "/run/slurmctld.pid";
         ExecReload = "${pkgs.coreutils}/bin/kill -HUP $MAINPID";
       };
-      environment = { SLURM_CONF = "${configFile}"; };
     };
 
   };
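The `wrappedSlurm` derivation above generates, for every slurm binary, a small shell wrapper that points `SLURM_CONF` at the generated config unless the caller already set it. A minimal sketch of the same idea for a single binary, using `pkgs.writeScriptBin` (the `configFile` argument and the choice of `sinfo` are assumptions for illustration; the module itself uses a custom builder so it can wrap every executable it finds):

```nix
{ pkgs, configFile }:

# Wrap one slurm binary: fall back to the module's generated slurm.conf
# unless the caller already exported SLURM_CONF.
pkgs.writeScriptBin "sinfo" ''
  #!${pkgs.stdenv.shell}
  if [ -z "$SLURM_CONF" ]; then
    export SLURM_CONF="${configFile}"
  fi
  exec ${pkgs.slurm-llnl}/bin/sinfo "$@"
''
```

Putting the fallback in wrappers rather than in the unit files means interactive use of `sinfo`, `srun`, and friends picks up the generated config too, while an explicit `SLURM_CONF` in the environment still wins.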
--- /dev/null
+++ b/nixos/tests/slurm.nix
@@ -0,0 +1,80 @@
+import ./make-test.nix ({ pkgs, ... }:
+
+let
+  mungekey = "mungeverryweakkeybuteasytointegratoinatest";
+
+  slurmconfig = {
+    client.enable = true;
+    controlMachine = "control";
+    nodeName = ''
+      control
+      NodeName=node[1-3] CPUs=1 State=UNKNOWN
+    '';
+    partitionName = "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP";
+  };
+in {
+  name = "slurm";
+
+  nodes =
+    let
+      computeNode =
+        { config, pkgs, ... }:
+        {
+          # TODO: the slurmd and slurmctld ports should be configuration
+          # options and be opened in the firewall automatically.
+          networking.firewall.enable = false;
+          services.munge.enable = true;
+          services.slurm = slurmconfig;
+        };
+    in {
+      control =
+        { config, pkgs, ... }:
+        {
+          networking.firewall.enable = false;
+          services.munge.enable = true;
+          services.slurm = {
+            server.enable = true;
+          } // slurmconfig;
+        };
+
+      node1 = computeNode;
+      node2 = computeNode;
+      node3 = computeNode;
+    };
+
+  testScript =
+  ''
+    startAll;
+
+    # Set up authentication across the cluster.
+    foreach my $node (($control,$node1,$node2,$node3))
+    {
+      $node->waitForUnit("default.target");
+
+      $node->succeed("mkdir /etc/munge");
+      $node->succeed("echo '${mungekey}' > /etc/munge/munge.key");
+      $node->succeed("chmod 0400 /etc/munge/munge.key");
+      $node->succeed("systemctl restart munged");
+    }
+
+    # Restart the services, since they have probably failed due to the
+    # initial munge failure.
+    subtest "can_start_slurmctld", sub {
+      $control->succeed("systemctl restart slurmctld");
+      $control->waitForUnit("slurmctld.service");
+    };
+
+    subtest "can_start_slurmd", sub {
+      foreach my $node (($control,$node1,$node2,$node3))
+      {
+        $node->succeed("systemctl restart slurmd.service");
+        $node->waitForUnit("slurmd");
+      }
+    };
+
+    # Test that the cluster works and can distribute jobs.
+    subtest "run_distributed_command", sub {
+      # Run `hostname` on 3 nodes of the partition (i.e. on all 3 nodes).
+      # The output must contain the 3 different host names.
+      $control->succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq");
+    };
+  '';
+})
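A sketch of how the test might be driven on its own, assuming a sibling nixpkgs checkout and the `make-test.nix` calling convention of the time (both the path and the invocation are assumptions, not part of the change):

```nix
# run-slurm-test.nix: evaluate the slurm VM test from a nixpkgs checkout.
# Build with: nix-build run-slurm-test.nix
import ./nixpkgs/nixos/tests/slurm.nix {
  system = builtins.currentSystem;
}
```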