From f795df26cfd0cb746be89f9e004a25d85e0d8465 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Hamb=C3=BCchen?= Date: Thu, 18 Jun 2020 01:45:11 +0200 Subject: [PATCH 1/8] consul.passthru.tests: Refactor: Extract variable --- nixos/tests/consul.nix | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/nixos/tests/consul.nix b/nixos/tests/consul.nix index 6600dae4770..785963d1a7c 100644 --- a/nixos/tests/consul.nix +++ b/nixos/tests/consul.nix @@ -66,6 +66,7 @@ let services.consul = let thisConsensusServerHost = builtins.elemAt allConsensusServerHosts index; + numConsensusServers = builtins.length allConsensusServerHosts; in assert builtins.elem thisConsensusServerHost allConsensusServerHosts; { @@ -73,12 +74,12 @@ let inherit webUi; extraConfig = defaultExtraConfig // { server = true; - bootstrap_expect = builtins.length allConsensusServerHosts; + bootstrap_expect = numConsensusServers; retry_join = # If there's only 1 node in the network, we allow self-join; # otherwise, the node must not try to join itself, and join only the other servers. # See https://github.com/hashicorp/consul/issues/2868 - if builtins.length allConsensusServerHosts == 1 + if numConsensusServers == 1 then allConsensusServerHosts else builtins.filter (h: h != thisConsensusServerHost) allConsensusServerHosts; bind_addr = ip; From 777d1c0944e931bf5bb3538982c144b8916d1b97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Hamb=C3=BCchen?= Date: Thu, 18 Jun 2020 01:48:19 +0200 Subject: [PATCH 2/8] consul.passthru.tests: Refactor let bindings --- nixos/tests/consul.nix | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/nixos/tests/consul.nix b/nixos/tests/consul.nix index 785963d1a7c..a5188d12830 100644 --- a/nixos/tests/consul.nix +++ b/nixos/tests/consul.nix @@ -55,31 +55,28 @@ let server = index: { pkgs, ... }: let - ip = builtins.elemAt allConsensusServerHosts index; + thisConsensusServerHost = builtins.elemAt allConsensusServerHosts index; + ip = thisConsensusServerHost; # since we already use IPs to identify servers in { networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [ - { address = builtins.elemAt allConsensusServerHosts index; prefixLength = 16; } + { address = ip; prefixLength = 16; } ]; networking.firewall = firewallSettings; services.consul = - let - thisConsensusServerHost = builtins.elemAt allConsensusServerHosts index; - numConsensusServers = builtins.length allConsensusServerHosts; - in assert builtins.elem thisConsensusServerHost allConsensusServerHosts; { enable = true; inherit webUi; extraConfig = defaultExtraConfig // { server = true; - bootstrap_expect = numConsensusServers; + bootstrap_expect = builtins.length allConsensusServerHosts; retry_join = # If there's only 1 node in the network, we allow self-join; # otherwise, the node must not try to join itself, and join only the other servers. # See https://github.com/hashicorp/consul/issues/2868 - if numConsensusServers == 1 + if builtins.length allConsensusServerHosts == 1 then allConsensusServerHosts else builtins.filter (h: h != thisConsensusServerHost) allConsensusServerHosts; bind_addr = ip; From 25d665634a1bd38515320beabf85a6e23545bac7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Hamb=C3=BCchen?= Date: Thu, 18 Jun 2020 01:49:24 +0200 Subject: [PATCH 3/8] consul.passthru.tests: Refactor: Extract variable --- nixos/tests/consul.nix | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/nixos/tests/consul.nix b/nixos/tests/consul.nix index a5188d12830..eb7dd45923f 100644 --- a/nixos/tests/consul.nix +++ b/nixos/tests/consul.nix @@ -55,6 +55,7 @@ let server = index: { pkgs, ... }: let + numConsensusServers = builtins.length allConsensusServerHosts; thisConsensusServerHost = builtins.elemAt allConsensusServerHosts index; ip = thisConsensusServerHost; # since we already use IPs to identify servers in @@ -71,12 +72,12 @@ let inherit webUi; extraConfig = defaultExtraConfig // { server = true; - bootstrap_expect = builtins.length allConsensusServerHosts; + bootstrap_expect = numConsensusServers; retry_join = # If there's only 1 node in the network, we allow self-join; # otherwise, the node must not try to join itself, and join only the other servers. # See https://github.com/hashicorp/consul/issues/2868 - if builtins.length allConsensusServerHosts == 1 + if numConsensusServers == 1 then allConsensusServerHosts else builtins.filter (h: h != thisConsensusServerHost) allConsensusServerHosts; bind_addr = ip; From a59a972413cef886eb7b2f048aa8dc08a61bf1a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Hamb=C3=BCchen?= Date: Thu, 18 Jun 2020 02:08:17 +0200 Subject: [PATCH 4/8] consul.passthru.tests: Fix failure on current consul. Fixes #90613. Done by setting `autopilot.min_quorum = 3`. Techncially, this would have been required to keep the test correct since Consul's "autopilot" "Dead Server Cleanup" was enabled by default (I believe that was in Consul 0.8). Practically, the issue only occurred with our NixOS test with releases >= `1.7.0-beta2` (see #90613). The setting itself is available since Consul 1.6.2. However, this setting was not documented clearly enough for anybody to notice, and only the upstream issue https://github.com/hashicorp/consul/issues/8118 I filed brought that to light. As explained there, the test could also have been made pass by applying the more correct rolling reboot procedure -m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]") +m.wait_until_succeeds( + "[ $(consul operator raft list-peers | grep true | wc -l) == 3 ]" +) but we also intend to test that Consul can regain consensus even if the quorum gets temporarily broken. --- nixos/tests/consul.nix | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/nixos/tests/consul.nix b/nixos/tests/consul.nix index eb7dd45923f..ffbbd835885 100644 --- a/nixos/tests/consul.nix +++ b/nixos/tests/consul.nix @@ -73,6 +73,10 @@ let extraConfig = defaultExtraConfig // { server = true; bootstrap_expect = numConsensusServers; + # Tell Consul that we never intend to drop below this many servers. + # Ensures to not permanently lose consensus after temporary loss. + # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040 + autopilot.min_quorum = numConsensusServers; retry_join = # If there's only 1 node in the network, we allow self-join; # otherwise, the node must not try to join itself, and join only the other servers. From 701c0eb4897554a6d783869420195878132475e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Hamb=C3=BCchen?= Date: Thu, 18 Jun 2020 02:43:11 +0200 Subject: [PATCH 5/8] consul.passthru.tests: Refactor into functions. For better naming and commentary. --- nixos/tests/consul.nix | 64 +++++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 23 deletions(-) diff --git a/nixos/tests/consul.nix b/nixos/tests/consul.nix index ffbbd835885..a3fc9166695 100644 --- a/nixos/tests/consul.nix +++ b/nixos/tests/consul.nix @@ -107,40 +107,58 @@ in { for m in machines: m.wait_for_unit("consul.service") + + def wait_for_healthy_servers(): + for m in machines: + m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]") + + + wait_for_healthy_servers() + # Also wait for clients to be alive. for m in machines: m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]") client1.succeed("consul kv put testkey 42") client2.succeed("[ $(consul kv get testkey) == 42 ]") - # Test that the cluster can tolearate failures of any single server: - for server in servers: - server.crash() - # For each client, wait until they have connection again - # using `kv get -recurse` before issuing commands. - client1.wait_until_succeeds("consul kv get -recurse") - client2.wait_until_succeeds("consul kv get -recurse") + def rolling_reboot_test(): + """ + Tests that the cluster can tolearate failures of any single server, + following the recommended rolling upgrade procedure from + https://www.consul.io/docs/upgrading#standard-upgrades + """ - # Do some consul actions while one server is down. - client1.succeed("consul kv put testkey 43") - client2.succeed("[ $(consul kv get testkey) == 43 ]") - client2.succeed("consul kv delete testkey") + for server in servers: + server.crash() - # Restart crashed machine. - server.start() + # For each client, wait until they have connection again + # using `kv get -recurse` before issuing commands. + client1.wait_until_succeeds("consul kv get -recurse") + client2.wait_until_succeeds("consul kv get -recurse") - # Wait for recovery. - for m in machines: - m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]") + # Do some consul actions while one server is down. + client1.succeed("consul kv put testkey 43") + client2.succeed("[ $(consul kv get testkey) == 43 ]") + client2.succeed("consul kv delete testkey") - # Wait for client connections. - client1.wait_until_succeeds("consul kv get -recurse") - client2.wait_until_succeeds("consul kv get -recurse") + # Restart crashed machine. + server.start() - # Do some consul actions with server back up. - client1.succeed("consul kv put testkey 44") - client2.succeed("[ $(consul kv get testkey) == 44 ]") - client2.succeed("consul kv delete testkey") + # Wait for recovery. + wait_for_healthy_servers() + + # Wait for client connections. + client1.wait_until_succeeds("consul kv get -recurse") + client2.wait_until_succeeds("consul kv get -recurse") + + # Do some consul actions with server back up. + client1.succeed("consul kv put testkey 44") + client2.succeed("[ $(consul kv get testkey) == 44 ]") + client2.succeed("consul kv delete testkey") + + + # Run the tests. + rolling_reboot_test() ''; }) From 811bcbe74a7838244f88462a21828d08bf7cc4b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Hamb=C3=BCchen?= Date: Thu, 18 Jun 2020 02:45:42 +0200 Subject: [PATCH 6/8] consul.passthru.tests: Use correct server health test. From: https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040 --- nixos/tests/consul.nix | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/nixos/tests/consul.nix b/nixos/tests/consul.nix index a3fc9166695..c6f2ac8b2f6 100644 --- a/nixos/tests/consul.nix +++ b/nixos/tests/consul.nix @@ -109,8 +109,23 @@ in { def wait_for_healthy_servers(): + # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040 + # for why the `Voter` column of `list-peers` has that info. + # TODO: The `grep true` relies on the fact that currently in + # the output like + # # consul operator raft list-peers + # Node ID Address State Voter RaftProtocol + # server3 ... 192.168.1.3:8300 leader true 3 + # server2 ... 192.168.1.2:8300 follower true 3 + # server1 ... 192.168.1.1:8300 follower false 3 + # `Voter`is the only boolean column. + # Change this to the more reliable way to be defined by + # https://github.com/hashicorp/consul/issues/8118 + # once that ticket is closed. for m in machines: - m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]") + m.wait_until_succeeds( + "[ $(consul operator raft list-peers | grep true | wc -l) == 3 ]" + ) wait_for_healthy_servers() From bcdac2e2fdfe5144a7752f85489d37532a1b332e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Hamb=C3=BCchen?= Date: Thu, 18 Jun 2020 03:05:54 +0200 Subject: [PATCH 7/8] consul.passthru.tests: Refactor: Extract function --- nixos/tests/consul.nix | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/nixos/tests/consul.nix b/nixos/tests/consul.nix index c6f2ac8b2f6..3e26dcad2ca 100644 --- a/nixos/tests/consul.nix +++ b/nixos/tests/consul.nix @@ -128,10 +128,18 @@ in { ) + def wait_for_all_machines_alive(): + """ + Note that Serf-"alive" does not mean "Raft"-healthy; + see `wait_for_healthy_servers()` for that instead. + """ + for m in machines: + m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]") + + wait_for_healthy_servers() # Also wait for clients to be alive. - for m in machines: - m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]") + wait_for_all_machines_alive() client1.succeed("consul kv put testkey 42") client2.succeed("[ $(consul kv get testkey) == 42 ]") From b3b27ed008a7813fa03f45f3e9f02aa3b68a450c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Hamb=C3=BCchen?= Date: Thu, 18 Jun 2020 03:06:24 +0200 Subject: [PATCH 8/8] consul.passthru.tests: Add 2 more tests --- nixos/tests/consul.nix | 50 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/nixos/tests/consul.nix b/nixos/tests/consul.nix index 3e26dcad2ca..ee85f1d0b91 100644 --- a/nixos/tests/consul.nix +++ b/nixos/tests/consul.nix @@ -145,11 +145,16 @@ in { client2.succeed("[ $(consul kv get testkey) == 42 ]") - def rolling_reboot_test(): + def rolling_reboot_test(proper_rolling_procedure=True): """ Tests that the cluster can tolearate failures of any single server, following the recommended rolling upgrade procedure from - https://www.consul.io/docs/upgrading#standard-upgrades + https://www.consul.io/docs/upgrading#standard-upgrades. + + Optionally, `proper_rolling_procedure=False` can be given + to wait only for each server to be back `Healthy`, not `Stable` + in the Raft consensus, see Consul setting `ServerStabilizationTime` and + https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040. """ for server in servers: @@ -168,8 +173,12 @@ in { # Restart crashed machine. server.start() - # Wait for recovery. - wait_for_healthy_servers() + if proper_rolling_procedure: + # Wait for recovery. + wait_for_healthy_servers() + else: + # NOT proper rolling upgrade procedure, see above. + wait_for_all_machines_alive() # Wait for client connections. client1.wait_until_succeeds("consul kv get -recurse") @@ -181,7 +190,40 @@ in { client2.succeed("consul kv delete testkey") + def all_servers_crash_simultaneously_test(): + """ + Tests that the cluster will eventually come back after all + servers crash simultaneously. + """ + + for server in servers: + server.crash() + + for server in servers: + server.start() + + # Wait for recovery. + wait_for_healthy_servers() + + # Wait for client connections. + client1.wait_until_succeeds("consul kv get -recurse") + client2.wait_until_succeeds("consul kv get -recurse") + + # Do some consul actions with servers back up. + client1.succeed("consul kv put testkey 44") + client2.succeed("[ $(consul kv get testkey) == 44 ]") + client2.succeed("consul kv delete testkey") + + # Run the tests. + + print("rolling_reboot_test()") rolling_reboot_test() + + print("all_servers_crash_simultaneously_test()") + all_servers_crash_simultaneously_test() + + print("rolling_reboot_test(proper_rolling_procedure=False)") + rolling_reboot_test(proper_rolling_procedure=False) ''; })