From 111d4eb090cdd71b227701842dc67f3bc6dc5f44 Mon Sep 17 00:00:00 2001
From: Markus Kowalewski
Date: Tue, 23 Oct 2018 23:43:53 +0200
Subject: [PATCH 1/5] nixos/slurm: run ctld as user and fix spool dir

* run as user 'slurm' by default instead of root
* add user/group slurm to ids.nix
* fix default location for the state dir of slurmctld
  (/var/spool -> /var/spool/slurmctld)
* update release notes with the above changes
---
 nixos/doc/manual/release-notes/rl-1903.xml      | 14 +++++++
 nixos/modules/misc/ids.nix                      |  2 +
 .../services/computing/slurm/slurm.nix          | 38 +++++++++++++++++++
 3 files changed, 54 insertions(+)

diff --git a/nixos/doc/manual/release-notes/rl-1903.xml b/nixos/doc/manual/release-notes/rl-1903.xml
index 9c07184613e..4f3e632c377 100644
--- a/nixos/doc/manual/release-notes/rl-1903.xml
+++ b/nixos/doc/manual/release-notes/rl-1903.xml
@@ -152,6 +152,20 @@
     has been renamed to postgresql_9_6.
+
+    Slurm introduces the new option
+    services.slurm.stateSaveLocation,
+    which is now set to /var/spool/slurmctld by default
+    (instead of /var/spool).
+    Make sure to move all files to the new directory or to set the
+    option accordingly.
+
+    The slurmctld now runs as user slurm instead of root.
+    If you want to keep slurmctld running as root, set
+    services.slurm.user = "root".
+
diff --git a/nixos/modules/misc/ids.nix b/nixos/modules/misc/ids.nix
index 5c30e512a1b..a32e4fe3f7c 100644
--- a/nixos/modules/misc/ids.nix
+++ b/nixos/modules/misc/ids.nix
@@ -331,6 +331,7 @@
       zeronet = 304;
       lirc = 305;
       lidarr = 306;
+      slurm = 307;
 
       # When adding a uid, make sure it doesn't match an existing gid. And don't use uids above 399!
 
@@ -622,6 +623,7 @@
       zeronet = 304;
       lirc = 305;
       lidarr = 306;
+      slurm = 307;
 
       # When adding a gid, make sure it doesn't match an existing
       # uid. Users and groups with the same name should have equal
diff --git a/nixos/modules/services/computing/slurm/slurm.nix b/nixos/modules/services/computing/slurm/slurm.nix
index 09174ed39f5..9dbb6a4d0d9 100644
--- a/nixos/modules/services/computing/slurm/slurm.nix
+++ b/nixos/modules/services/computing/slurm/slurm.nix
@@ -6,9 +6,14 @@ let
 
   cfg = config.services.slurm;
   # configuration file can be generated by http://slurm.schedmd.com/configurator.html
+
+  defaultUser = "slurm";
+
   configFile = pkgs.writeTextDir "slurm.conf"
     ''
       ClusterName=${cfg.clusterName}
+      StateSaveLocation=${cfg.stateSaveLocation}
+      SlurmUser=${cfg.user}
       ${optionalString (cfg.controlMachine != null) ''controlMachine=${cfg.controlMachine}''}
       ${optionalString (cfg.controlAddr != null) ''controlAddr=${cfg.controlAddr}''}
       ${optionalString (cfg.nodeName != null) ''nodeName=${cfg.nodeName}''}
@@ -159,6 +164,25 @@ in
         '';
       };
 
+      stateSaveLocation = mkOption {
+        type = types.str;
+        default = "/var/spool/slurmctld";
+        description = ''
+          Directory into which the Slurm controller, slurmctld, saves its state.
+        '';
+      };
+
+      user = mkOption {
+        type = types.str;
+        default = defaultUser;
+        description = ''
+          Set this option when you want to run the slurmctld daemon
+          as something other than the default slurm user "slurm".
+          Note that the UID of this user needs to be the same
+          on all nodes.
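+          For example, the previous behaviour of running slurmctld
+          as root can be restored with
+          services.slurm.user = "root";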
+        '';
+      };
+
       extraConfig = mkOption {
         default = "";
         type = types.lines;
@@ -226,6 +250,15 @@ in
 
     services.munge.enable = mkDefault true;
 
+    # use a static uid as default to ensure it is the same on all nodes
+    users.users.slurm = mkIf (cfg.user == defaultUser) {
+      name = defaultUser;
+      group = "slurm";
+      uid = config.ids.uids.slurm;
+    };
+
+    users.groups.slurm.gid = config.ids.gids.slurm;
+
     systemd.services.slurmd = mkIf (cfg.client.enable) {
       path = with pkgs; [ wrappedSlurm coreutils ]
         ++ lib.optional cfg.enableSrunX11 slurm-spank-x11;
@@ -261,6 +294,11 @@ in
         PIDFile = "/run/slurmctld.pid";
         ExecReload = "${pkgs.coreutils}/bin/kill -HUP $MAINPID";
       };
+
+      preStart = ''
+        mkdir -p ${cfg.stateSaveLocation}
+        chown -R ${cfg.user}:slurm ${cfg.stateSaveLocation}
+      '';
     };
   };
 

From 79c9dbfb40d5123f18ed64485265d8bc19209091 Mon Sep 17 00:00:00 2001
From: Markus Kowalewski
Date: Sat, 15 Sep 2018 13:09:36 +0200
Subject: [PATCH 2/5] nixos/slurm: add slurmdbd to module

* New options services.slurm.dbdserver.[enable, dbdHost, extraConfig]
* Add slurmdbd to the slurm.nix test
---
 .../services/computing/slurm/slurm.nix          | 55 ++++++++++++++++++-
 nixos/tests/slurm.nix                           | 51 +++++++++++++++--
 2 files changed, 99 insertions(+), 7 deletions(-)

diff --git a/nixos/modules/services/computing/slurm/slurm.nix b/nixos/modules/services/computing/slurm/slurm.nix
index 9dbb6a4d0d9..77b9c40577a 100644
--- a/nixos/modules/services/computing/slurm/slurm.nix
+++ b/nixos/modules/services/computing/slurm/slurm.nix
@@ -29,12 +29,19 @@ let
       ${cfg.extraPlugstackConfig}
     '';
 
-
   cgroupConfig = pkgs.writeTextDir "cgroup.conf"
    ''
      ${cfg.extraCgroupConfig}
    '';
 
+  slurmdbdConf = pkgs.writeTextDir "slurmdbd.conf"
+   ''
+     DbdHost=${cfg.dbdserver.dbdHost}
+     SlurmUser=${cfg.user}
+     StorageType=accounting_storage/mysql
+     ${cfg.dbdserver.extraConfig}
+   '';
+
   # slurm expects some additional config files to be
   # in the same directory as slurm.conf
   etcSlurm = pkgs.symlinkJoin {
@@ -65,6 +72,27 @@ in
       };
     };
 
+    dbdserver = {
+      enable = mkEnableOption "SlurmDBD service";
+
+      dbdHost = mkOption {
+        type = types.str;
+        default = config.networking.hostName;
+        description = ''
+          Hostname of the machine where slurmdbd
+          is running (i.e. the name returned by hostname -s).
+        '';
+      };
+
+      extraConfig = mkOption {
+        type = types.lines;
+        default = "";
+        description = ''
+          Extra configuration for slurmdbd.conf.
+        '';
+      };
+    };
+
     client = {
       enable = mkEnableOption "slurm client daemon";
     };
@@ -208,6 +236,8 @@ in
           used when procTrackType=proctrack/cgroup.
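          For example, a line such as ConstrainCores=yes could be
          given here (an illustrative cgroup.conf setting, not a default).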
        '';
      };
+
+
    };
 
  };
 
@@ -244,7 +274,10 @@ in
        '';
      };
 
-  in mkIf (cfg.enableStools || cfg.client.enable || cfg.server.enable) {
+  in mkIf ( cfg.enableStools ||
+            cfg.client.enable ||
+            cfg.server.enable ||
+            cfg.dbdserver.enable ) {
 
    environment.systemPackages = [ wrappedSlurm ];
 
@@ -301,6 +334,24 @@ in
      '';
    };
 
+    systemd.services.slurmdbd = mkIf (cfg.dbdserver.enable) {
+      path = with pkgs; [ wrappedSlurm munge coreutils ];
+
+      wantedBy = [ "multi-user.target" ];
+      after = [ "network.target" "munged.service" "mysql.service" ];
+      requires = [ "munged.service" "mysql.service" ];
+
+      # slurm strips the last component off the path
+      environment.SLURM_CONF = "${slurmdbdConf}/slurm.conf";
+
+      serviceConfig = {
+        Type = "forking";
+        ExecStart = "${cfg.package}/bin/slurmdbd";
+        PIDFile = "/run/slurmdbd.pid";
+        ExecReload = "${pkgs.coreutils}/bin/kill -HUP $MAINPID";
+      };
+    };
+
  };
 
 }
diff --git a/nixos/tests/slurm.nix b/nixos/tests/slurm.nix
index 60f44c3c845..54ea1ee7894 100644
--- a/nixos/tests/slurm.nix
+++ b/nixos/tests/slurm.nix
@@ -1,5 +1,7 @@
 import ./make-test.nix ({ ... }:
-let mungekey = "mungeverryweakkeybuteasytointegratoinatest";
+let
+  mungekey = "mungeverryweakkeybuteasytointegratoinatest";
+
   slurmconfig = {
     controlMachine = "control";
     nodeName = ''
@@ -7,6 +9,10 @@ let
       NodeName=node[1-3] CPUs=1 State=UNKNOWN
     '';
     partitionName = "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP";
+    extraConfig = ''
+      AccountingStorageHost=dbd
+      AccountingStorageType=accounting_storage/slurmdbd
+    '';
   };
 in {
   name = "slurm";
@@ -16,7 +22,7 @@ in {
     computeNode =
       { ...}:
       {
-        # TODO slrumd port and slurmctld port should be configurations and
+        # TODO slurmd port and slurmctld port should be configurations and
         # automatically allowed by the firewall.
         networking.firewall.enable = false;
         services.slurm = {
@@ -43,6 +49,24 @@ in {
       } // slurmconfig;
     };
 
+    dbd =
+      { pkgs, ... } :
+      {
+        networking.firewall.enable = false;
+        services.slurm.dbdserver = {
+          enable = true;
+        };
+        services.mysql = {
+          enable = true;
+          package = pkgs.mysql;
+          ensureDatabases = [ "slurm_acct_db" ];
+          ensureUsers = [{
+            ensurePermissions = { "slurm_acct_db.*" = "ALL PRIVILEGES"; };
+            name = "slurm";
+          }];
+        };
+      };
+
     node1 = computeNode;
     node2 = computeNode;
     node3 = computeNode;
@@ -54,7 +78,7 @@ in {
 
     startAll;
 
     # Set up authentication across the cluster
-    foreach my $node (($submit,$control,$node1,$node2,$node3))
+    foreach my $node (($submit,$control,$dbd,$node1,$node2,$node3))
     {
       $node->waitForUnit("default.target");
 
       $node->succeed("echo '${mungekey}' > /etc/munge/munge.key");
       $node->succeed("chmod 0400 /etc/munge/munge.key");
       $node->succeed("chown munge:munge /etc/munge/munge.key");
       $node->succeed("systemctl restart munged");
-    }
+
+      $node->waitForUnit("munged");
+    };
 
     # Restart the services since they have probably failed due to the munge init
     # failure
+    subtest "can_start_slurmdbd", sub {
+      $dbd->succeed("systemctl restart slurmdbd");
+      $dbd->waitForUnit("slurmdbd.service");
+    };
+
+    # There needs to be an entry for the current
+    # cluster in the database before slurmctld is restarted
+    subtest "add_account", sub {
+      $control->succeed("sacctmgr -i add cluster default");
+    };
 
     subtest "can_start_slurmctld", sub {
       $control->succeed("systemctl restart slurmctld");
@@ -81,12 +117,17 @@ in {
       }
     };
 
-    # Test that the cluster work and can distribute jobs;
+    # Test that the cluster works and can distribute jobs
     subtest "run_distributed_command", sub {
       # Run `hostname` on 3 nodes of the partition (so on all 3 nodes).
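       # (sort | uniq collapses duplicate host names, wc -l counts the
       # distinct ones, and `xargs test 3 -eq` asserts that exactly three remain)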
       # The output must contain the 3 different names
       $submit->succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq");
     };
+
+    subtest "check_slurm_dbd", sub {
+      # find the srun job from above in the database
+      $submit->succeed("sacct | grep hostname");
+    };
   '';
 })

From f51f7534163ec0159c195657dddd5c10443c54a4 Mon Sep 17 00:00:00 2001
From: Markus Kowalewski
Date: Wed, 24 Oct 2018 00:08:39 +0200
Subject: [PATCH 3/5] nixos/slurm: fix obsolete string type
---
 nixos/modules/services/computing/slurm/slurm.nix | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nixos/modules/services/computing/slurm/slurm.nix b/nixos/modules/services/computing/slurm/slurm.nix
index 77b9c40577a..d9a201dc7ed 100644
--- a/nixos/modules/services/computing/slurm/slurm.nix
+++ b/nixos/modules/services/computing/slurm/slurm.nix
@@ -183,7 +183,7 @@ in
       };
 
       procTrackType = mkOption {
-        type = types.string;
+        type = types.str;
         default = "proctrack/linuxproc";
         description = ''
           Plugin to be used for process tracking on a job step basis.

From d2799d1835fdfc68e2b621beff1fa951f055ae39 Mon Sep 17 00:00:00 2001
From: Markus Kowalewski
Date: Thu, 25 Oct 2018 20:34:17 +0200
Subject: [PATCH 4/5] nixos/slurm: node/partitionName option -> list

Make the nodeName and partitionName options lists. There can be
more than one partition or set of nodes.
Add changes to the release notes.
---
 nixos/doc/manual/release-notes/rl-1903.xml       |  6 ++++++
 nixos/modules/services/computing/slurm/slurm.nix | 16 ++++++++--------
 nixos/tests/slurm.nix                            |  7 ++-----
 3 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/nixos/doc/manual/release-notes/rl-1903.xml b/nixos/doc/manual/release-notes/rl-1903.xml
index 4f3e632c377..1d9b6ecc0e2 100644
--- a/nixos/doc/manual/release-notes/rl-1903.xml
+++ b/nixos/doc/manual/release-notes/rl-1903.xml
@@ -165,6 +165,12 @@
     If you want to keep slurmctld running as root, set
     services.slurm.user = "root".
+
+    The options services.slurm.nodeName and
+    services.slurm.partitionName are now lists of
+    strings to correctly reflect the fact that each of these
+    options can occur more than once in the configuration.
+
diff --git a/nixos/modules/services/computing/slurm/slurm.nix b/nixos/modules/services/computing/slurm/slurm.nix
index d9a201dc7ed..a3f2367dba4 100644
--- a/nixos/modules/services/computing/slurm/slurm.nix
+++ b/nixos/modules/services/computing/slurm/slurm.nix
@@ -16,8 +16,8 @@ let
       SlurmUser=${cfg.user}
       ${optionalString (cfg.controlMachine != null) ''controlMachine=${cfg.controlMachine}''}
       ${optionalString (cfg.controlAddr != null) ''controlAddr=${cfg.controlAddr}''}
-      ${optionalString (cfg.nodeName != null) ''nodeName=${cfg.nodeName}''}
-      ${optionalString (cfg.partitionName != null) ''partitionName=${cfg.partitionName}''}
+      ${toString (map (x: "NodeName=${x}\n") cfg.nodeName)}
+      ${toString (map (x: "PartitionName=${x}\n") cfg.partitionName)}
       PlugStackConfig=${plugStackConfig}
       ProctrackType=${cfg.procTrackType}
       ${cfg.extraConfig}
@@ -149,9 +149,9 @@ in
       };
 
       nodeName = mkOption {
-        type = types.nullOr types.str;
-        default = null;
-        example = "linux[1-32] CPUs=1 State=UNKNOWN";
+        type = types.listOf types.str;
+        default = [];
+        example = literalExample ''[ "linux[1-32] CPUs=1 State=UNKNOWN" ]'';
         description = ''
           Name that SLURM uses to refer to a node (or base partition for BlueGene systems).
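          Since the option is now a list, several node definitions can
          be given, e.g. [ "node[1-4] CPUs=2" "bignode CPUs=64" ]
          (illustrative values).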
          Typically this would be the string that "/bin/hostname -s"
          returns.
@@ -160,9 +160,9 @@ in
       };
 
       partitionName = mkOption {
-        type = types.nullOr types.str;
-        default = null;
-        example = "debug Nodes=linux[1-32] Default=YES MaxTime=INFINITE State=UP";
+        type = types.listOf types.str;
+        default = [];
+        example = literalExample ''[ "debug Nodes=linux[1-32] Default=YES MaxTime=INFINITE State=UP" ]'';
         description = ''
           Name by which the partition may be referenced. Note that
           now you have to write the partition's parameters after the name.
diff --git a/nixos/tests/slurm.nix b/nixos/tests/slurm.nix
index 54ea1ee7894..6937a5f0991 100644
--- a/nixos/tests/slurm.nix
+++ b/nixos/tests/slurm.nix
@@ -4,11 +4,8 @@ let
 
   slurmconfig = {
     controlMachine = "control";
-    nodeName = ''
-      control
-      NodeName=node[1-3] CPUs=1 State=UNKNOWN
-    '';
-    partitionName = "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP";
+    nodeName = [ "node[1-3] CPUs=1 State=UNKNOWN" ];
+    partitionName = [ "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP" ];
     extraConfig = ''
       AccountingStorageHost=dbd
       AccountingStorageType=accounting_storage/slurmdbd

From b388beeca3a3a2dd79ba2f2737deb294d4d4748b Mon Sep 17 00:00:00 2001
From: Markus Kowalewski
Date: Thu, 25 Oct 2018 21:03:23 +0200
Subject: [PATCH 5/5] nixos/slurm: add maintainer to module and test
---
 nixos/modules/services/computing/slurm/slurm.nix | 2 ++
 nixos/tests/slurm.nix                            | 4 +++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/nixos/modules/services/computing/slurm/slurm.nix b/nixos/modules/services/computing/slurm/slurm.nix
index a3f2367dba4..cd481212db2 100644
--- a/nixos/modules/services/computing/slurm/slurm.nix
+++ b/nixos/modules/services/computing/slurm/slurm.nix
@@ -55,6 +55,8 @@ in
 
   ###### interface
 
+  meta.maintainers = [ maintainers.markuskowa ];
+
   options = {
 
     services.slurm = {
diff --git a/nixos/tests/slurm.nix b/nixos/tests/slurm.nix
index 6937a5f0991..7f9c266cbff 100644
--- a/nixos/tests/slurm.nix
+++ b/nixos/tests/slurm.nix
@@ -1,4 +1,4 @@
-import ./make-test.nix ({ ... }:
+import ./make-test.nix ({ lib, ... }:
 
 let
   mungekey = "mungeverryweakkeybuteasytointegratoinatest";
@@ -14,6 +14,8 @@ let
 in {
   name = "slurm";
 
+  meta.maintainers = [ lib.maintainers.markuskowa ];
+
   nodes =
     let
     computeNode =