| 
									
										
										
										
											2019-11-10 23:07:08 +01:00
										 |  |  |  | import ./make-test-python.nix ({ lib, ... }: | 
					
						
							| 
									
										
										
										
											2018-09-15 13:09:36 +02:00
										 |  |  |  | let | 
					
						
    # Munge key shared by all machines; the test script writes it to
    # /etc/munge/munge.key on every node. Fixed and weak on purpose: it only
    # has to be identical across the test VMs.
    mungekey = "mungeverryweakkeybuteasytointegratoinatest";
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-12-25 15:55:07 +01:00
    # Slurm settings common to the control, submit and compute machines.
    # Each of those machines merges this set into its `services.slurm`.
    slurmconfig = {
      controlMachine = "control";
      nodeName = [ "node[1-3] CPUs=1 State=UNKNOWN" ];
      partitionName = [ "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP" ];
      # Point accounting storage at slurmdbd on the `dbd` machine.
      extraConfig = ''
        AccountingStorageHost=dbd
        AccountingStorageType=accounting_storage/slurmdbd
      '';
    };
					
						
							|  |  |  |  | in { | 
					
						
							|  |  |  |  |   name = "slurm"; | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-10-25 21:03:23 +02:00
										 |  |  |  |   meta.maintainers = [ lib.maintainers.markuskowa ]; | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-12-25 15:55:07 +01:00
										 |  |  |  |   nodes = | 
					
						
							|  |  |  |  |     let | 
					
						
							|  |  |  |  |     computeNode = | 
					
						
							| 
									
										
										
										
											2018-07-20 20:56:59 +00:00
										 |  |  |  |       { ...}: | 
					
						
							| 
									
										
										
										
											2015-12-25 15:55:07 +01:00
										 |  |  |  |       { | 
					
						
							| 
									
										
										
										
											2018-09-15 13:09:36 +02:00
										 |  |  |  |         # TODO slurmd port and slurmctld port should be configurations and | 
					
						
							| 
									
										
										
										
											2015-12-25 15:55:07 +01:00
										 |  |  |  |         # automatically allowed by the  firewall. | 
					
						
							|  |  |  |  |         networking.firewall.enable = false; | 
					
						
							| 
									
										
										
										
											2018-06-01 23:42:21 +02:00
										 |  |  |  |         services.slurm = { | 
					
						
							|  |  |  |  |           client.enable = true; | 
					
						
							|  |  |  |  |         } // slurmconfig; | 
					
						
							| 
									
										
										
										
											2015-12-25 15:55:07 +01:00
										 |  |  |  |       }; | 
					
						
							|  |  |  |  |     in { | 
					
						
							| 
									
										
										
										
											2018-06-01 23:42:21 +02:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-12-25 15:55:07 +01:00
										 |  |  |  |     control = | 
					
						
							| 
									
										
										
										
											2018-07-20 20:56:59 +00:00
										 |  |  |  |       { ...}: | 
					
						
							| 
									
										
										
										
											2015-12-25 15:55:07 +01:00
										 |  |  |  |       { | 
					
						
							|  |  |  |  |         networking.firewall.enable = false; | 
					
						
							|  |  |  |  |         services.slurm = { | 
					
						
							|  |  |  |  |           server.enable = true; | 
					
						
							|  |  |  |  |         } // slurmconfig; | 
					
						
							|  |  |  |  |       }; | 
					
						
							| 
									
										
										
										
											2018-06-01 23:42:21 +02:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  |     submit = | 
					
						
							| 
									
										
										
										
											2018-07-20 20:56:59 +00:00
										 |  |  |  |       { ...}: | 
					
						
							| 
									
										
										
										
											2018-06-01 23:42:21 +02:00
										 |  |  |  |       { | 
					
						
							|  |  |  |  |         networking.firewall.enable = false; | 
					
						
							|  |  |  |  |         services.slurm = { | 
					
						
							|  |  |  |  |           enableStools = true; | 
					
						
							|  |  |  |  |         } // slurmconfig; | 
					
						
							|  |  |  |  |       }; | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-09-15 13:09:36 +02:00
										 |  |  |  |     dbd = | 
					
						
							|  |  |  |  |       { pkgs, ... } : | 
					
						
							|  |  |  |  |       { | 
					
						
							|  |  |  |  |         networking.firewall.enable = false; | 
					
						
							|  |  |  |  |         services.slurm.dbdserver = { | 
					
						
							|  |  |  |  |           enable = true; | 
					
						
							| 
									
										
										
										
											2019-11-10 21:28:09 +01:00
										 |  |  |  |           storagePass = "password123"; | 
					
						
							| 
									
										
										
										
											2018-09-15 13:09:36 +02:00
										 |  |  |  |         }; | 
					
						
							|  |  |  |  |         services.mysql = { | 
					
						
							|  |  |  |  |           enable = true; | 
					
						
							| 
									
										
										
										
											2019-11-10 21:28:09 +01:00
										 |  |  |  |           package = pkgs.mariadb; | 
					
						
							|  |  |  |  |           initialScript = pkgs.writeText "mysql-init.sql" ''
 | 
					
						
							|  |  |  |  |             CREATE USER 'slurm'@'localhost' IDENTIFIED BY 'password123'; | 
					
						
							|  |  |  |  |             GRANT ALL PRIVILEGES ON slurm_acct_db.* TO 'slurm'@'localhost'; | 
					
						
							|  |  |  |  |           '';
 | 
					
						
							| 
									
										
										
										
											2018-09-15 13:09:36 +02:00
										 |  |  |  |           ensureDatabases = [ "slurm_acct_db" ]; | 
					
						
							|  |  |  |  |           ensureUsers = [{ | 
					
						
							|  |  |  |  |             ensurePermissions = { "slurm_acct_db.*" = "ALL PRIVILEGES"; }; | 
					
						
							|  |  |  |  |             name = "slurm"; | 
					
						
							|  |  |  |  |           }]; | 
					
						
							| 
									
										
										
										
											2018-11-22 13:21:37 +01:00
										 |  |  |  |           extraOptions = ''
 | 
					
						
							|  |  |  |  |             # recommendations from: https://slurm.schedmd.com/accounting.html#mysql-configuration | 
					
						
							|  |  |  |  |             innodb_buffer_pool_size=1024M | 
					
						
							|  |  |  |  |             innodb_log_file_size=64M | 
					
						
							|  |  |  |  |             innodb_lock_wait_timeout=900 | 
					
						
							|  |  |  |  |           '';
 | 
					
						
							| 
									
										
										
										
											2018-09-15 13:09:36 +02:00
										 |  |  |  |         }; | 
					
						
							|  |  |  |  |       }; | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-12-25 15:55:07 +01:00
    # Three identical compute nodes; the names match the `node[1-3]` range
    # used in the shared slurm settings.
    node1 = computeNode;
    node2 = computeNode;
    node3 = computeNode;
					
						
							|  |  |  |  |   }; | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-06-01 23:42:21 +02:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-12-25 15:55:07 +01:00
										 |  |  |  |   testScript = | 
					
						
							|  |  |  |  |   ''
 | 
					
						
							| 
									
										
										
										
											2019-11-10 23:07:08 +01:00
										 |  |  |  |   start_all() | 
					
						
							| 
									
										
										
										
											2015-12-25 15:55:07 +01:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  |   # Set up authentification across the cluster | 
					
						
							| 
									
										
										
										
											2019-11-10 23:07:08 +01:00
										 |  |  |  |   for node in [submit, control, dbd, node1, node2, node3]: | 
					
						
							| 
									
										
										
										
											2015-12-25 15:55:07 +01:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-11-10 23:07:08 +01:00
										 |  |  |  |       node.wait_for_unit("default.target") | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |       node.succeed("mkdir /etc/munge") | 
					
						
							|  |  |  |  |       node.succeed( | 
					
						
							|  |  |  |  |           "echo '${mungekey}' > /etc/munge/munge.key" | 
					
						
							|  |  |  |  |       ) | 
					
						
							|  |  |  |  |       node.succeed("chmod 0400 /etc/munge/munge.key") | 
					
						
							|  |  |  |  |       node.succeed("chown munge:munge /etc/munge/munge.key") | 
					
						
							|  |  |  |  |       node.succeed("systemctl restart munged") | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |       node.wait_for_unit("munged") | 
					
						
							| 
									
										
										
										
											2018-09-15 13:09:36 +02:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-12-25 15:55:07 +01:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  |   # Restart the services since they have probably failed due to the munge init | 
					
						
							|  |  |  |  |   # failure | 
					
						
							| 
									
										
										
										
											2019-11-10 23:07:08 +01:00
										 |  |  |  |   with subtest("can_start_slurmdbd"): | 
					
						
							|  |  |  |  |       dbd.succeed("systemctl restart slurmdbd") | 
					
						
							|  |  |  |  |       dbd.wait_for_unit("slurmdbd.service") | 
					
						
							|  |  |  |  |       dbd.wait_for_open_port(6819) | 
					
						
							| 
									
										
										
										
											2018-09-15 13:09:36 +02:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  |   # there needs to be an entry for the current | 
					
						
							|  |  |  |  |   # cluster in the database before slurmctld is restarted | 
					
						
							| 
									
										
										
										
											2019-11-10 23:07:08 +01:00
										 |  |  |  |   with subtest("add_account"): | 
					
						
							|  |  |  |  |       control.succeed("sacctmgr -i add cluster default") | 
					
						
							|  |  |  |  |       # check for cluster entry | 
					
						
							|  |  |  |  |       control.succeed("sacctmgr list cluster | awk '{ print $1 }' | grep default") | 
					
						
							| 
									
										
										
										
											2015-12-25 15:55:07 +01:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-11-10 23:07:08 +01:00
										 |  |  |  |   with subtest("can_start_slurmctld"): | 
					
						
							|  |  |  |  |       control.succeed("systemctl restart slurmctld") | 
					
						
							| 
									
										
										
										
											2019-12-24 19:51:18 +01:00
										 |  |  |  |       control.wait_for_unit("slurmctld.service") | 
					
						
							| 
									
										
										
										
											2015-12-25 15:55:07 +01:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-11-10 23:07:08 +01:00
										 |  |  |  |   with subtest("can_start_slurmd"): | 
					
						
							|  |  |  |  |       for node in [node1, node2, node3]: | 
					
						
							|  |  |  |  |           node.succeed("systemctl restart slurmd.service") | 
					
						
							|  |  |  |  |           node.wait_for_unit("slurmd") | 
					
						
							| 
									
										
										
										
											2015-12-25 15:55:07 +01:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-09-15 13:09:36 +02:00
										 |  |  |  |   # Test that the cluster works and can distribute jobs; | 
					
						
							| 
									
										
										
										
											2015-12-25 15:55:07 +01:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-11-10 23:07:08 +01:00
										 |  |  |  |   with subtest("run_distributed_command"): | 
					
						
							|  |  |  |  |       # Run `hostname` on 3 nodes of the partition (so on all the 3 nodes). | 
					
						
							|  |  |  |  |       # The output must contain the 3 different names | 
					
						
							|  |  |  |  |       submit.succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq") | 
					
						
							| 
									
										
										
										
											2018-09-15 13:09:36 +02:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-11-10 23:07:08 +01:00
										 |  |  |  |       with subtest("check_slurm_dbd"): | 
					
						
							|  |  |  |  |           # find the srun job from above in the database | 
					
						
							|  |  |  |  |           control.succeed("sleep 5") | 
					
						
							|  |  |  |  |           control.succeed("sacct | grep hostname") | 
					
						
							| 
									
										
										
										
											2015-12-25 15:55:07 +01:00
										 |  |  |  |   '';
 | 
					
						
							|  |  |  |  | }) |