Merge pull request #11870 from lancelotsix/improve_slurm_service
Improve slurm service configuration
This commit is contained in:
		
						commit
						7e14e28a80
					
				@ -34,6 +34,15 @@ in
 | 
			
		||||
 | 
			
		||||
      };
 | 
			
		||||
 | 
			
		||||
      package = mkOption {
 | 
			
		||||
        type = types.package;
 | 
			
		||||
        default = pkgs.slurm-llnl;
 | 
			
		||||
        example = literalExample "pkgs.slurm-llnl-full";
 | 
			
		||||
        description = ''
 | 
			
		||||
          The package to use for slurm binaries.
 | 
			
		||||
        '';
 | 
			
		||||
      };
 | 
			
		||||
 | 
			
		||||
      controlMachine = mkOption {
 | 
			
		||||
        type = types.nullOr types.str;
 | 
			
		||||
        default = null;
 | 
			
		||||
@ -91,38 +100,69 @@ in
 | 
			
		||||
 | 
			
		||||
  ###### implementation
 | 
			
		||||
 | 
			
		||||
  config = mkIf (cfg.client.enable || cfg.server.enable) {
 | 
			
		||||
  config =
 | 
			
		||||
    let
 | 
			
		||||
      wrappedSlurm = pkgs.stdenv.mkDerivation {
 | 
			
		||||
        name = "wrappedSlurm";
 | 
			
		||||
 | 
			
		||||
    environment.systemPackages = [ pkgs.slurm-llnl ];
 | 
			
		||||
        propagatedBuildInputs = [ cfg.package configFile ];
 | 
			
		||||
 | 
			
		||||
        builder = pkgs.writeText "builder.sh" ''
 | 
			
		||||
          source $stdenv/setup
 | 
			
		||||
          mkdir -p $out/bin
 | 
			
		||||
          find  ${cfg.package}/bin -type f -executable | while read EXE
 | 
			
		||||
          do
 | 
			
		||||
            exename="$(basename $EXE)"
 | 
			
		||||
            wrappername="$out/bin/$exename"
 | 
			
		||||
            cat > "$wrappername" <<EOT
 | 
			
		||||
          #!/bin/sh
 | 
			
		||||
          if [ -z "$SLURM_CONF" ]
 | 
			
		||||
          then
 | 
			
		||||
            SLURM_CONF="${configFile}" "$EXE" "\$@"
 | 
			
		||||
          else
 | 
			
		||||
            "$EXE" "\$0"
 | 
			
		||||
          fi
 | 
			
		||||
          EOT
 | 
			
		||||
            chmod +x "$wrappername"
 | 
			
		||||
          done
 | 
			
		||||
        '';
 | 
			
		||||
      };
 | 
			
		||||
 | 
			
		||||
  in mkIf (cfg.client.enable || cfg.server.enable) {
 | 
			
		||||
 | 
			
		||||
    environment.systemPackages = [ wrappedSlurm ];
 | 
			
		||||
 | 
			
		||||
    systemd.services.slurmd = mkIf (cfg.client.enable) {
 | 
			
		||||
      path = with pkgs; [ slurm-llnl coreutils ];
 | 
			
		||||
      path = with pkgs; [ wrappedSlurm coreutils ];
 | 
			
		||||
 | 
			
		||||
      wantedBy = [ "multi-user.target" ];
 | 
			
		||||
      after = [ "systemd-tmpfiles-clean.service" ];
 | 
			
		||||
 | 
			
		||||
      serviceConfig = {
 | 
			
		||||
        Type = "forking";
 | 
			
		||||
        ExecStart = "${pkgs.slurm-llnl}/bin/slurmd -f ${configFile}";
 | 
			
		||||
        ExecStart = "${wrappedSlurm}/bin/slurmd";
 | 
			
		||||
        PIDFile = "/run/slurmd.pid";
 | 
			
		||||
        ExecReload = "${pkgs.coreutils}/bin/kill -HUP $MAINPID";
 | 
			
		||||
      };
 | 
			
		||||
 | 
			
		||||
      preStart = ''
 | 
			
		||||
        mkdir -p /var/spool
 | 
			
		||||
      '';
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    systemd.services.slurmctld = mkIf (cfg.server.enable) {
 | 
			
		||||
      path = with pkgs; [ slurm-llnl munge coreutils ];
 | 
			
		||||
      path = with pkgs; [ wrappedSlurm munge coreutils ];
 | 
			
		||||
      
 | 
			
		||||
      wantedBy = [ "multi-user.target" ];
 | 
			
		||||
      after = [ "network.target" "auditd.service" "munged.service" "slurmdbd.service" ];
 | 
			
		||||
      after = [ "network.target" "munged.service" ];
 | 
			
		||||
      requires = [ "munged.service" ];
 | 
			
		||||
 | 
			
		||||
      serviceConfig = {
 | 
			
		||||
        Type = "forking";
 | 
			
		||||
        ExecStart = "${pkgs.slurm-llnl}/bin/slurmctld";
 | 
			
		||||
        ExecStart = "${wrappedSlurm}/bin/slurmctld";
 | 
			
		||||
        PIDFile = "/run/slurmctld.pid";
 | 
			
		||||
        ExecReload = "${pkgs.coreutils}/bin/kill -HUP $MAINPID";
 | 
			
		||||
      };
 | 
			
		||||
      environment = { SLURM_CONF = "${configFile}"; };
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										80
									
								
								nixos/tests/slurm.nix
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										80
									
								
								nixos/tests/slurm.nix
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,80 @@
 | 
			
		||||
import ./make-test.nix ({ pkgs, ... }:
 | 
			
		||||
let mungekey = "mungeverryweakkeybuteasytointegratoinatest";
 | 
			
		||||
    slurmconfig = {
 | 
			
		||||
      client.enable = true;
 | 
			
		||||
      controlMachine = "control";
 | 
			
		||||
      nodeName = ''
 | 
			
		||||
        control
 | 
			
		||||
        NodeName=node[1-3] CPUs=1 State=UNKNOWN
 | 
			
		||||
      '';
 | 
			
		||||
      partitionName = "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP";
 | 
			
		||||
    };
 | 
			
		||||
in {
 | 
			
		||||
  name = "slurm";
 | 
			
		||||
 | 
			
		||||
  nodes =
 | 
			
		||||
    let
 | 
			
		||||
    computeNode =
 | 
			
		||||
      { config, pkgs, ...}:
 | 
			
		||||
      {
 | 
			
		||||
        # TODO slurmd port and slurmctld port should be configurable and
 | 
			
		||||
        # automatically allowed by the firewall.
 | 
			
		||||
        networking.firewall.enable = false;
 | 
			
		||||
        services.munge.enable = true;
 | 
			
		||||
        services.slurm = slurmconfig;
 | 
			
		||||
      };
 | 
			
		||||
    in {
 | 
			
		||||
    control =
 | 
			
		||||
      { config, pkgs, ...}:
 | 
			
		||||
      {
 | 
			
		||||
        networking.firewall.enable = false;
 | 
			
		||||
        services.munge.enable = true;
 | 
			
		||||
        services.slurm = {
 | 
			
		||||
          server.enable = true;
 | 
			
		||||
        } // slurmconfig;
 | 
			
		||||
      };
 | 
			
		||||
    node1 = computeNode;
 | 
			
		||||
    node2 = computeNode;
 | 
			
		||||
    node3 = computeNode;
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  testScript =
 | 
			
		||||
  ''
 | 
			
		||||
  startAll;
 | 
			
		||||
 | 
			
		||||
  # Set up authentication across the cluster
 | 
			
		||||
  foreach my $node (($control,$node1,$node2,$node3))
 | 
			
		||||
  {
 | 
			
		||||
    $node->waitForUnit("default.target");
 | 
			
		||||
 | 
			
		||||
    $node->succeed("mkdir /etc/munge");
 | 
			
		||||
    $node->succeed("echo '${mungekey}' > /etc/munge/munge.key");
 | 
			
		||||
    $node->succeed("chmod 0400 /etc/munge/munge.key");
 | 
			
		||||
    $node->succeed("systemctl restart munged");
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  # Restart the services since they have probably failed due to the munge init
 | 
			
		||||
  # failure
 | 
			
		||||
 | 
			
		||||
  subtest "can_start_slurmctld", sub {
 | 
			
		||||
    $control->succeed("systemctl restart slurmctld");
 | 
			
		||||
    $control->waitForUnit("slurmctld.service");
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  subtest "can_start_slurmd", sub {
 | 
			
		||||
    foreach my $node (($control,$node1,$node2,$node3))
 | 
			
		||||
    {
 | 
			
		||||
      $node->succeed("systemctl restart slurmd.service");
 | 
			
		||||
      $node->waitForUnit("slurmd");
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  # Test that the cluster works and can distribute jobs.
 | 
			
		||||
 | 
			
		||||
  subtest "run_distributed_command", sub {
 | 
			
		||||
    # Run `hostname` on 3 nodes of the partition (so on all 3 nodes).
 | 
			
		||||
    # The output must contain the 3 different names
 | 
			
		||||
    $control->succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq");
 | 
			
		||||
  };
 | 
			
		||||
  '';
 | 
			
		||||
})
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user