# NixOS VM test for the Slurm workload manager.
#
# Machines (all of them share the `slurmconfig` attribute set below):
#   control    enables services.slurm.server; slurmctld is started here
#   submit     enables services.slurm.enableStools only; jobs are submitted
#              from here with srun
#   dbd        enables services.slurm.dbdserver plus a local MySQL instance
#              holding the accounting database
#   node1..3   enable services.slurm.client; slurmd is started here
#
# The test script installs a shared munge key on every machine, brings the
# daemons up in dependency order (slurmdbd, slurmctld, slurmd), runs a job
# across all three compute nodes and finally checks that the job shows up in
# the accounting database.
import ./make-test.nix ({ lib, ... }:
let
    # Shared secret written to /etc/munge/munge.key on every machine by the
    # test script. Deliberately weak: it only has to be identical everywhere.
    mungekey = "mungeverryweakkeybuteasytointegratoinatest";

    # Settings merged into services.slurm on every machine so that all of
    # them agree on the controller, the node list and the partition.
    slurmconfig = {
      controlMachine = "control";
      nodeName = [ "node[1-3] CPUs=1 State=UNKNOWN" ];
      partitionName = [ "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP" ];
      # Send accounting data to the slurmdbd instance on the `dbd` machine.
      extraConfig = ''
        AccountingStorageHost=dbd
        AccountingStorageType=accounting_storage/slurmdbd
      '';
    };
in {
  name = "slurm";

  meta.maintainers = [ lib.maintainers.markuskowa ];

  nodes =
    let
    # Configuration shared by node1, node2 and node3.
    computeNode =
      { ... }:
      {
        # TODO slurmd port and slurmctld port should be configurations and
        # automatically allowed by the firewall.
        networking.firewall.enable = false;
        services.slurm = {
          client.enable = true;
        } // slurmconfig;
      };
    in {

    control =
      { ... }:
      {
        networking.firewall.enable = false;
        services.slurm = {
          server.enable = true;
        } // slurmconfig;
      };

    submit =
      { ... }:
      {
        networking.firewall.enable = false;
        services.slurm = {
          enableStools = true;
        } // slurmconfig;
      };

    dbd =
      { pkgs, ... }:
      {
        networking.firewall.enable = false;
        services.slurm.dbdserver = {
          enable = true;
        };
        # Database backend for slurmdbd: one database and one user with
        # full privileges on it.
        services.mysql = {
          enable = true;
          package = pkgs.mysql;
          ensureDatabases = [ "slurm_acct_db" ];
          ensureUsers = [{
            ensurePermissions = { "slurm_acct_db.*" = "ALL PRIVILEGES"; };
            name = "slurm";
          }];
          extraOptions = ''
            # recommendations from: https://slurm.schedmd.com/accounting.html#mysql-configuration
            innodb_buffer_pool_size=1024M
            innodb_log_file_size=64M
            innodb_lock_wait_timeout=900
          '';
        };
      };

    node1 = computeNode;
    node2 = computeNode;
    node3 = computeNode;
  };


  testScript =
  ''
  startAll;

  # Set up authentication across the cluster
  foreach my $node (($submit,$control,$dbd,$node1,$node2,$node3))
  {
    $node->waitForUnit("default.target");

    $node->succeed("mkdir /etc/munge");
    $node->succeed("echo '${mungekey}' > /etc/munge/munge.key");
    $node->succeed("chmod 0400 /etc/munge/munge.key");
    $node->succeed("chown munge:munge /etc/munge/munge.key");
    $node->succeed("systemctl restart munged");

    $node->waitForUnit("munged");
  };

  # Restart the services since they have probably failed due to the munge init
  # failure
  subtest "can_start_slurmdbd", sub {
    $dbd->succeed("systemctl restart slurmdbd");
    $dbd->waitForUnit("slurmdbd.service");
    $dbd->waitForOpenPort(6819);
  };

  # there needs to be an entry for the current
  # cluster in the database before slurmctld is restarted
  subtest "add_account", sub {
    $control->succeed("sacctmgr -i add cluster default");
    # check for cluster entry
    $control->succeed("sacctmgr list cluster | awk '{ print \$1 }' | grep default");
  };

  subtest "can_start_slurmctld", sub {
    $control->succeed("systemctl restart slurmctld");
    $control->waitForUnit("slurmctld.service");
  };

  subtest "can_start_slurmd", sub {
    foreach my $node (($node1,$node2,$node3))
    {
      $node->succeed("systemctl restart slurmd.service");
      $node->waitForUnit("slurmd");
    }
  };

  # Test that the cluster works and can distribute jobs

  subtest "run_distributed_command", sub {
    # Run `hostname` on 3 nodes of the partition (so on all the 3 nodes).
    # The output must contain the 3 different names
    $submit->succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq");
  };

  subtest "check_slurm_dbd", sub {
    # find the srun job from above in the database
    # The accounting record is not necessarily visible right after srun
    # returns, so poll for it instead of sleeping for a fixed interval.
    $control->waitUntilSucceeds("sacct | grep hostname");
  };
  '';
})