# Listing metadata: 169 lines, 4.8 KiB, Nix.
import ./make-test-python.nix ({ lib, pkgs, ... }:
 | 
						||
let
 | 
						||
    # Settings shared by the machines that import this attrset as a module
    # (the controller, the submit host and the compute nodes below).
    slurmconfig = {
      # The firewall is switched off on these machines.
      networking.firewall.enable = false;

      # Put the compiled MPI test program on the system path.
      environment.systemPackages = [ mpitest ];

      # Create the munge key file with a fixed content; the dbd machine
      # writes the very same key.
      systemd.tmpfiles.rules = [
        "f /etc/munge/munge.key 0400 munge munge - mungeverryweakkeybuteasytointegratoinatest"
      ];

      # Cluster layout: controller "control", compute nodes node1..node3,
      # all of them in one default partition named "debug".
      services.slurm.controlMachine = "control";
      services.slurm.nodeName = [ "node[1-3] CPUs=1 State=UNKNOWN" ];
      services.slurm.partitionName = [ "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP" ];

      # Accounting data is stored through slurmdbd on the host "dbd".
      services.slurm.extraConfig = ''
        AccountingStorageHost=dbd
        AccountingStorageType=accounting_storage/slurmdbd
      '';
    };
 | 
						||
 | 
						||
    # Small MPI "hello world" program. Rank 0 additionally prints the
    # communicator size as `size=<n>`, which the test script greps for.
    # The result is a package providing $out/bin/mpitest.
    mpitest = let
      # C source of the test program, stored as a plain text file.
      mpitestC = pkgs.writeText "mpitest.c" ''
        #include <stdio.h>
        #include <stdlib.h>
        #include <mpi.h>

        int
        main (int argc, char *argv[])
        {
          int rank, size, length;
          char name[512];

          MPI_Init (&argc, &argv);
          MPI_Comm_rank (MPI_COMM_WORLD, &rank);
          MPI_Comm_size (MPI_COMM_WORLD, &size);
          MPI_Get_processor_name (name, &length);

          if ( rank == 0 ) printf("size=%d\n", size);

          printf ("%s: hello world from process %d of %d\n", name, rank, size);

          MPI_Finalize ();

          return EXIT_SUCCESS;
        }
      '';
    # `runCommandNoCC` is only a deprecated alias of `runCommand` (and is not
    # available when aliases are disabled), so the canonical name is used.
    # The compiler wrapper is referenced by its full store path.
    in pkgs.runCommand "mpitest" {} ''
      mkdir -p $out/bin
      ${pkgs.openmpi}/bin/mpicc ${mpitestC} -o $out/bin/mpitest
    '';
 | 
						||
in {
 | 
						||
  # Name of this test.
  name = "slurm";

  # People responsible for keeping this test working.
  meta = {
    maintainers = [ lib.maintainers.markuskowa ];
  };
 | 
						||
 | 
						||
  # Machines of the test: one controller, one submit host, one accounting
  # database host and three identical compute nodes.
  nodes =
    let
      # Module used for every compute node: the shared cluster settings plus
      # the Slurm client option (the test script restarts slurmd there).
      computeNode = { ... }: {
        imports = [ slurmconfig ];
        # TODO: the slurmd and slurmctld ports should be configuration
        # options that are automatically allowed by the firewall.
        services.slurm.client.enable = true;
      };
    in
    {
      # Controller machine; the test script restarts slurmctld on it.
      control = { ... }: {
        imports = [ slurmconfig ];
        services.slurm.server.enable = true;
      };

      # Machine the jobs are submitted from (the test script runs srun here).
      submit = { ... }: {
        imports = [ slurmconfig ];
        services.slurm.enableStools = true;
      };

      # Accounting host: slurmdbd backed by a local MariaDB instance.
      # It does not import slurmconfig, so the firewall and munge key
      # settings are repeated here.
      dbd = { pkgs, ... }:
        let
          # File holding the database password handed to slurmdbd; it is the
          # same password the init script below assigns to the SQL user.
          passFile = pkgs.writeText "dbdpassword" "password123";
        in
        {
          networking.firewall.enable = false;

          # Must be the same key as on the other cluster machines.
          systemd.tmpfiles.rules = [
            "f /etc/munge/munge.key 0400 munge munge - mungeverryweakkeybuteasytointegratoinatest"
          ];

          services.slurm.dbdserver.enable = true;
          services.slurm.dbdserver.storagePassFile = "${passFile}";

          services.mysql = {
            enable = true;
            package = pkgs.mariadb;

            # Database and user that slurmdbd works with.
            ensureDatabases = [ "slurm_acct_db" ];
            ensureUsers = [
              {
                name = "slurm";
                ensurePermissions = { "slurm_acct_db.*" = "ALL PRIVILEGES"; };
              }
            ];

            # Creates the password-authenticated user for slurmdbd.
            initialScript = pkgs.writeText "mysql-init.sql" ''
              CREATE USER 'slurm'@'localhost' IDENTIFIED BY 'password123';
              GRANT ALL PRIVILEGES ON slurm_acct_db.* TO 'slurm'@'localhost';
            '';

            # recommendations from: https://slurm.schedmd.com/accounting.html#mysql-configuration
            settings.mysqld = {
              innodb_buffer_pool_size = "1024M";
              innodb_log_file_size = "64M";
              innodb_lock_wait_timeout = 900;
            };
          };
        };

      # Three compute nodes, matching "node[1-3]" in the shared Slurm settings.
      node1 = computeNode;
      node2 = computeNode;
      node3 = computeNode;
    };
 | 
						||
 | 
						||
 | 
						||
  # Python script for the test driver. It brings up slurmdbd, registers the
  # cluster in the accounting database, starts slurmctld and the slurmd
  # daemons, then runs a plain job and an MPI job across all three nodes.
  testScript =
  ''
  start_all()

  # Make sure DBD is up after DB initialization
  with subtest("can_start_slurmdbd"):
      dbd.succeed("systemctl restart slurmdbd")
      dbd.wait_for_unit("slurmdbd.service")
      dbd.wait_for_open_port(6819)

  # there needs to be an entry for the current
  # cluster in the database before slurmctld is restarted
  with subtest("add_account"):
      control.succeed("sacctmgr -i add cluster default")
      # check for cluster entry
      control.succeed("sacctmgr list cluster | awk '{ print $1 }' | grep default")

  with subtest("can_start_slurmctld"):
      control.succeed("systemctl restart slurmctld")
      control.wait_for_unit("slurmctld.service")

  with subtest("can_start_slurmd"):
      for node in [node1, node2, node3]:
          node.succeed("systemctl restart slurmd.service")
          node.wait_for_unit("slurmd")

  # Test that the cluster works and can distribute jobs;

  with subtest("run_distributed_command"):
      # Run `hostname` on 3 nodes of the partition (so on all the 3 nodes).
      # The output must contain the 3 different names
      submit.succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq")

      with subtest("check_slurm_dbd"):
          # find the srun job from above in the database;
          # the record may show up with a delay, so poll for it
          # instead of sleeping for a fixed amount of time
          control.wait_until_succeeds("sacct | grep hostname")

  with subtest("run_PMIx_mpitest"):
      submit.succeed("srun -N 3 --mpi=pmix mpitest | grep size=3")
  '';
 | 
						||
})
 |