Merge pull request #49348 from markuskowa/mod-slurm-upgrade
nixos/slurm: add slurmdbd, run daemons as user
This commit is contained in: commit 6efd811062
@@ -152,6 +152,26 @@
     has been renamed to <varname>postgresql_9_6</varname>.
   </para>
 </listitem>
+<listitem>
+  <para>
+    Slurm introduces the new option
+    <literal>services.slurm.stateSaveLocation</literal>,
+    which is now set to <literal>/var/spool/slurm</literal> by default
+    (instead of <literal>/var/spool</literal>).
+    Make sure to move all files to the new directory or to set the option accordingly.
+  </para>
+  <para>
+    slurmctld now runs as the user <literal>slurm</literal> instead of <literal>root</literal>.
+    If you want to keep slurmctld running as <literal>root</literal>, set
+    <literal>services.slurm.user = "root"</literal>.
+  </para>
+  <para>
+    The options <literal>services.slurm.nodeName</literal> and
+    <literal>services.slurm.partitionName</literal> are now lists of
+    strings to correctly reflect the fact that each of these
+    options can occur more than once in the configuration.
+  </para>
+</listitem>
 </itemizedlist>
 </section>

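For reference, a minimal sketch of a configuration using the options described in the release note above (the node and partition strings are illustrative, taken from the option examples further down in this diff):

```nix
{
  services.slurm = {
    server.enable = true;
    # nodeName and partitionName are now lists; one entry per NodeName/PartitionName line
    nodeName = [ "linux[1-32] CPUs=1 State=UNKNOWN" ];
    partitionName = [ "debug Nodes=linux[1-32] Default=YES MaxTime=INFINITE State=UP" ];
    # state no longer lives directly under /var/spool; pin it explicitly if you migrated files
    stateSaveLocation = "/var/spool/slurmctld";
    # uncomment to keep the previous behaviour of running slurmctld as root
    # user = "root";
  };
}
```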
@@ -331,6 +331,7 @@
     zeronet = 304;
     lirc = 305;
     lidarr = 306;
+    slurm = 307;

     # When adding a uid, make sure it doesn't match an existing gid. And don't use uids above 399!

@@ -622,6 +623,7 @@
     zeronet = 304;
     lirc = 305;
     lidarr = 306;
+    slurm = 307;

     # When adding a gid, make sure it doesn't match an existing
     # uid. Users and groups with the same name should have equal
@@ -6,13 +6,18 @@ let

   cfg = config.services.slurm;
   # configuration file can be generated by http://slurm.schedmd.com/configurator.html
+
+  defaultUser = "slurm";
+
   configFile = pkgs.writeTextDir "slurm.conf"
     ''
       ClusterName=${cfg.clusterName}
+      StateSaveLocation=${cfg.stateSaveLocation}
+      SlurmUser=${cfg.user}
       ${optionalString (cfg.controlMachine != null) ''controlMachine=${cfg.controlMachine}''}
       ${optionalString (cfg.controlAddr != null) ''controlAddr=${cfg.controlAddr}''}
-      ${optionalString (cfg.nodeName != null) ''nodeName=${cfg.nodeName}''}
-      ${optionalString (cfg.partitionName != null) ''partitionName=${cfg.partitionName}''}
+      ${toString (map (x: "NodeName=${x}\n") cfg.nodeName)}
+      ${toString (map (x: "PartitionName=${x}\n") cfg.partitionName)}
       PlugStackConfig=${plugStackConfig}
       ProctrackType=${cfg.procTrackType}
       ${cfg.extraConfig}
@@ -24,12 +29,19 @@ let
       ${cfg.extraPlugstackConfig}
     '';


   cgroupConfig = pkgs.writeTextDir "cgroup.conf"
     ''
       ${cfg.extraCgroupConfig}
     '';

+  slurmdbdConf = pkgs.writeTextDir "slurmdbd.conf"
+    ''
+      DbdHost=${cfg.dbdserver.dbdHost}
+      SlurmUser=${cfg.user}
+      StorageType=accounting_storage/mysql
+      ${cfg.dbdserver.extraConfig}
+    '';
+
   # slurm expects some additional config files to be
   # in the same directory as slurm.conf
   etcSlurm = pkgs.symlinkJoin {
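As a quick illustration of the new list handling above, a `nix repl` sketch of what the NodeName mapping expands to in the generated slurm.conf (the node strings are made up):

```nix
# mirrors the map used for NodeName in configFile above
toString (map (x: "NodeName=${x}\n") [ "node[1-3] CPUs=1 State=UNKNOWN" "gpu1 CPUs=8" ])
# => "NodeName=node[1-3] CPUs=1 State=UNKNOWN\n NodeName=gpu1 CPUs=8\n"
# i.e. one NodeName= line per list element (the space after each \n is harmless in slurm.conf)
```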
@@ -43,6 +55,8 @@ in

   ###### interface

+  meta.maintainers = [ maintainers.markuskowa ];
+
   options = {

     services.slurm = {
@@ -60,6 +74,27 @@ in
         };
       };

+      dbdserver = {
+        enable = mkEnableOption "SlurmDBD service";
+
+        dbdHost = mkOption {
+          type = types.str;
+          default = config.networking.hostName;
+          description = ''
+            Hostname of the machine where <literal>slurmdbd</literal>
+            is running (i.e. name returned by <literal>hostname -s</literal>).
+          '';
+        };
+
+        extraConfig = mkOption {
+          type = types.lines;
+          default = "";
+          description = ''
+            Extra configuration for <literal>slurmdbd.conf</literal>
+          '';
+        };
+      };
+
       client = {
         enable = mkEnableOption "slurm client daemon";
       };
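A hedged sketch of how the new dbdserver options might be used. StorageUser and StoragePass are keys of slurmdbd.conf itself, not options of this module, and anything put in extraConfig ends up world-readable in the Nix store:

```nix
{
  services.slurm.dbdserver = {
    enable = true;
    # defaults to config.networking.hostName
    dbdHost = "dbd";
    # appended verbatim to the generated slurmdbd.conf
    extraConfig = ''
      StorageUser=slurm
      StoragePass=verysecret
    '';
  };
}
```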
@@ -116,9 +151,9 @@ in
       };

       nodeName = mkOption {
-        type = types.nullOr types.str;
-        default = null;
-        example = "linux[1-32] CPUs=1 State=UNKNOWN";
+        type = types.listOf types.str;
+        default = [];
+        example = literalExample ''[ "linux[1-32] CPUs=1 State=UNKNOWN" ];'';
         description = ''
           Name that SLURM uses to refer to a node (or base partition for BlueGene
           systems). Typically this would be the string that "/bin/hostname -s"
@@ -127,9 +162,9 @@ in
       };

       partitionName = mkOption {
-        type = types.nullOr types.str;
-        default = null;
-        example = "debug Nodes=linux[1-32] Default=YES MaxTime=INFINITE State=UP";
+        type = types.listOf types.str;
+        default = [];
+        example = literalExample ''[ "debug Nodes=linux[1-32] Default=YES MaxTime=INFINITE State=UP" ];'';
         description = ''
           Name by which the partition may be referenced. Note that now you have
           to write the partition's parameters after the name.
@@ -150,7 +185,7 @@ in
       };

       procTrackType = mkOption {
-        type = types.string;
+        type = types.str;
         default = "proctrack/linuxproc";
         description = ''
           Plugin to be used for process tracking on a job step basis.
@@ -159,6 +194,25 @@ in
         '';
       };

+      stateSaveLocation = mkOption {
+        type = types.str;
+        default = "/var/spool/slurmctld";
+        description = ''
+          Directory into which the Slurm controller, slurmctld, saves its state.
+        '';
+      };
+
+      user = mkOption {
+        type = types.str;
+        default = defaultUser;
+        description = ''
+          Set this option when you want to run the slurmctld daemon
+          as something other than the default slurm user "slurm".
+          Note that the UID of this user needs to be the same
+          on all nodes.
+        '';
+      };
+
       extraConfig = mkOption {
         default = "";
         type = types.lines;
@@ -184,6 +238,8 @@ in
           used when <literal>procTrackType=proctrack/cgroup</literal>.
         '';
       };
+
+
     };

   };
@@ -220,12 +276,24 @@ in
       '';
     };

-  in mkIf (cfg.enableStools || cfg.client.enable || cfg.server.enable) {
+  in mkIf ( cfg.enableStools ||
+            cfg.client.enable ||
+            cfg.server.enable ||
+            cfg.dbdserver.enable ) {

     environment.systemPackages = [ wrappedSlurm ];

     services.munge.enable = mkDefault true;

+    # use a static uid as default to ensure it is the same on all nodes
+    users.users.slurm = mkIf (cfg.user == defaultUser) {
+      name = defaultUser;
+      group = "slurm";
+      uid = config.ids.uids.slurm;
+    };
+
+    users.groups.slurm.gid = config.ids.uids.slurm;
+
     systemd.services.slurmd = mkIf (cfg.client.enable) {
       path = with pkgs; [ wrappedSlurm coreutils ]
         ++ lib.optional cfg.enableSrunX11 slurm-spank-x11;
@@ -261,6 +329,29 @@ in
         PIDFile = "/run/slurmctld.pid";
         ExecReload = "${pkgs.coreutils}/bin/kill -HUP $MAINPID";
       };
+
+      preStart = ''
+        mkdir -p ${cfg.stateSaveLocation}
+        chown -R ${cfg.user}:slurm ${cfg.stateSaveLocation}
+      '';
+    };
+
+    systemd.services.slurmdbd = mkIf (cfg.dbdserver.enable) {
+      path = with pkgs; [ wrappedSlurm munge coreutils ];
+
+      wantedBy = [ "multi-user.target" ];
+      after = [ "network.target" "munged.service" "mysql.service" ];
+      requires = [ "munged.service" "mysql.service" ];
+
+      # slurm strips the last component off the path
+      environment.SLURM_CONF = "${slurmdbdConf}/slurm.conf";
+
+      serviceConfig = {
+        Type = "forking";
+        ExecStart = "${cfg.package}/bin/slurmdbd";
+        PIDFile = "/run/slurmdbd.pid";
+        ExecReload = "${pkgs.coreutils}/bin/kill -HUP $MAINPID";
+      };
     };

   };
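Since the new slurmdbd unit requires mysql.service, a head-node sketch that ties it together, mirroring the "dbd" machine in the test below (the database name and user match what the test provisions; slurm_acct_db is the database slurmdbd uses by default):

```nix
{ pkgs, ... }:
{
  services.slurm.dbdserver.enable = true;

  # slurmdbd stores accounting data in MySQL/MariaDB
  services.mysql = {
    enable = true;
    package = pkgs.mysql;
    ensureDatabases = [ "slurm_acct_db" ];
    ensureUsers = [{
      name = "slurm";
      ensurePermissions = { "slurm_acct_db.*" = "ALL PRIVILEGES"; };
    }];
  };
}
```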
@@ -1,22 +1,27 @@
-import ./make-test.nix ({ ... }:
-let mungekey = "mungeverryweakkeybuteasytointegratoinatest";
+import ./make-test.nix ({ lib, ... }:
+let
+  mungekey = "mungeverryweakkeybuteasytointegratoinatest";
+
   slurmconfig = {
     controlMachine = "control";
-    nodeName = ''
-      control
-      NodeName=node[1-3] CPUs=1 State=UNKNOWN
+    nodeName = [ "node[1-3] CPUs=1 State=UNKNOWN" ];
+    partitionName = [ "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP" ];
+    extraConfig = ''
+      AccountingStorageHost=dbd
+      AccountingStorageType=accounting_storage/slurmdbd
     '';
-    partitionName = "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP";
   };
 in {
   name = "slurm";

+  meta.maintainers = [ lib.maintainers.markuskowa ];
+
   nodes =
     let
     computeNode =
       { ...}:
       {
-        # TODO slrumd port and slurmctld port should be configurations and
+        # TODO slurmd port and slurmctld port should be configurations and
         # automatically allowed by the firewall.
         networking.firewall.enable = false;
         services.slurm = {
@@ -43,6 +48,24 @@ in {
         } // slurmconfig;
       };

+    dbd =
+      { pkgs, ... } :
+      {
+        networking.firewall.enable = false;
+        services.slurm.dbdserver = {
+          enable = true;
+        };
+        services.mysql = {
+          enable = true;
+          package = pkgs.mysql;
+          ensureDatabases = [ "slurm_acct_db" ];
+          ensureUsers = [{
+            ensurePermissions = { "slurm_acct_db.*" = "ALL PRIVILEGES"; };
+            name = "slurm";
+          }];
+        };
+      };
+
     node1 = computeNode;
     node2 = computeNode;
     node3 = computeNode;
@@ -54,7 +77,7 @@ in {
     startAll;

     # Set up authentification across the cluster
-    foreach my $node (($submit,$control,$node1,$node2,$node3))
+    foreach my $node (($submit,$control,$dbd,$node1,$node2,$node3))
     {
       $node->waitForUnit("default.target");

@@ -63,10 +86,22 @@ in {
       $node->succeed("chmod 0400 /etc/munge/munge.key");
       $node->succeed("chown munge:munge /etc/munge/munge.key");
       $node->succeed("systemctl restart munged");
-    }
+
+      $node->waitForUnit("munged");
+    };
+
     # Restart the services since they have probably failed due to the munge init
     # failure
+    subtest "can_start_slurmdbd", sub {
+      $dbd->succeed("systemctl restart slurmdbd");
+      $dbd->waitForUnit("slurmdbd.service");
+    };
+
+    # there needs to be an entry for the current
+    # cluster in the database before slurmctld is restarted
+    subtest "add_account", sub {
+      $control->succeed("sacctmgr -i add cluster default");
+    };
+
     subtest "can_start_slurmctld", sub {
       $control->succeed("systemctl restart slurmctld");
@@ -81,12 +116,17 @@ in {
       }
     };

-    # Test that the cluster work and can distribute jobs;
+    # Test that the cluster works and can distribute jobs;

     subtest "run_distributed_command", sub {
       # Run `hostname` on 3 nodes of the partition (so on all the 3 nodes).
       # The output must contain the 3 different names
       $submit->succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq");
     };
+
+    subtest "check_slurm_dbd", sub {
+      # find the srun job from above in the database
+      $submit->succeed("sacct | grep hostname");
+    };
   '';
 })