From 4994f0f7d062f932ef36ca6e36d5556334c4c93a Mon Sep 17 00:00:00 2001
From: Lancelot SIX
Date: Fri, 25 Dec 2015 15:55:07 +0100
Subject: [PATCH] slurm service: add tests

---
 nixos/tests/slurm.nix | 89 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 nixos/tests/slurm.nix

diff --git a/nixos/tests/slurm.nix b/nixos/tests/slurm.nix
new file mode 100644
index 00000000000..0dd00dfb04c
--- /dev/null
+++ b/nixos/tests/slurm.nix
@@ -0,0 +1,89 @@
+# Test of the slurm service on a four-machine cluster: `control`, which has
+# the slurm server enabled, and three compute nodes (node1, node2, node3).
+# All machines are given the same munge key by the test script.
+import ./make-test.nix ({ pkgs, ... }:
+let mungekey = "mungeverryweakkeybuteasytointegratoinatest";
+    # Settings shared by every machine of the cluster; the control machine
+    # additionally enables the server part (see `control` below).
+    slurmconfig = {
+      client.enable = true;
+      controlMachine = "control";
+      nodeName = ''
+        control
+        NodeName=node[1-3] CPUs=1 State=UNKNOWN
+      '';
+      partitionName = "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP";
+    };
+in {
+  name = "slurm";
+
+  nodes =
+    let
+    computeNode =
+      { config, pkgs, ...}:
+      {
+        # TODO: slurmd port and slurmctld port should be configurations and
+        # automatically allowed by the firewall.
+        networking.firewall.enable = false;
+        services.munge.enable = true;
+        services.slurm = slurmconfig;
+      };
+    in {
+    control =
+      { config, pkgs, ...}:
+      {
+        networking.firewall.enable = false;
+        services.munge.enable = true;
+        services.slurm = {
+          server.enable = true;
+        } // slurmconfig;
+      };
+    node1 = computeNode;
+    node2 = computeNode;
+    node3 = computeNode;
+  };
+
+  testScript =
+  ''
+  startAll;
+
+  # Every machine of the cluster, the control machine included.
+  my @machines = ($control, $node1, $node2, $node3);
+
+  # Set up authentication across the cluster
+  foreach my $node (@machines)
+  {
+    $node->waitForUnit("default.target");
+
+    # -p: do not fail if the directory already exists.
+    $node->succeed("mkdir -p /etc/munge");
+    $node->succeed("echo '${mungekey}' > /etc/munge/munge.key");
+    $node->succeed("chmod 0400 /etc/munge/munge.key");
+    $node->succeed("systemctl restart munged");
+  }
+
+  # Restart the services since they have probably failed due to the munge init
+  # failure
+
+  subtest "can_start_slurmctld", sub {
+    $control->succeed("systemctl restart slurmctld");
+    $control->waitForUnit("slurmctld.service");
+  };
+
+  subtest "can_start_slurmd", sub {
+    foreach my $node (@machines)
+    {
+      $node->succeed("systemctl restart slurmd.service");
+      $node->waitForUnit("slurmd.service");
+    }
+  };
+
+  # Test that the cluster works and can distribute jobs
+
+  subtest "run_distributed_command", sub {
+    # Run `hostname` on 3 nodes of the partition (so on all the 3 nodes).
+    # The output must contain the 3 different names
+    $control->succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq");
+  };
+  '';
+})