Merge pull request #90701 from nh2/issue-90613-fix-consul-reboot-test
consul.passthru.tests: Fix failure on current consul versions, add more tests
This commit is contained in:
commit
5c5f7a22fe
|
@ -55,30 +55,33 @@ let
|
||||||
|
|
||||||
server = index: { pkgs, ... }:
|
server = index: { pkgs, ... }:
|
||||||
let
|
let
|
||||||
ip = builtins.elemAt allConsensusServerHosts index;
|
numConsensusServers = builtins.length allConsensusServerHosts;
|
||||||
|
thisConsensusServerHost = builtins.elemAt allConsensusServerHosts index;
|
||||||
|
ip = thisConsensusServerHost; # since we already use IPs to identify servers
|
||||||
in
|
in
|
||||||
{
|
{
|
||||||
networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
|
networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
|
||||||
{ address = builtins.elemAt allConsensusServerHosts index; prefixLength = 16; }
|
{ address = ip; prefixLength = 16; }
|
||||||
];
|
];
|
||||||
networking.firewall = firewallSettings;
|
networking.firewall = firewallSettings;
|
||||||
|
|
||||||
services.consul =
|
services.consul =
|
||||||
let
|
|
||||||
thisConsensusServerHost = builtins.elemAt allConsensusServerHosts index;
|
|
||||||
in
|
|
||||||
assert builtins.elem thisConsensusServerHost allConsensusServerHosts;
|
assert builtins.elem thisConsensusServerHost allConsensusServerHosts;
|
||||||
{
|
{
|
||||||
enable = true;
|
enable = true;
|
||||||
inherit webUi;
|
inherit webUi;
|
||||||
extraConfig = defaultExtraConfig // {
|
extraConfig = defaultExtraConfig // {
|
||||||
server = true;
|
server = true;
|
||||||
bootstrap_expect = builtins.length allConsensusServerHosts;
|
bootstrap_expect = numConsensusServers;
|
||||||
|
# Tell Consul that we never intend to drop below this many servers.
|
||||||
|
# Ensures we do not permanently lose consensus after temporary loss.
|
||||||
|
# See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
|
||||||
|
autopilot.min_quorum = numConsensusServers;
|
||||||
retry_join =
|
retry_join =
|
||||||
# If there's only 1 node in the network, we allow self-join;
|
# If there's only 1 node in the network, we allow self-join;
|
||||||
# otherwise, the node must not try to join itself, and join only the other servers.
|
# otherwise, the node must not try to join itself, and join only the other servers.
|
||||||
# See https://github.com/hashicorp/consul/issues/2868
|
# See https://github.com/hashicorp/consul/issues/2868
|
||||||
if builtins.length allConsensusServerHosts == 1
|
if numConsensusServers == 1
|
||||||
then allConsensusServerHosts
|
then allConsensusServerHosts
|
||||||
else builtins.filter (h: h != thisConsensusServerHost) allConsensusServerHosts;
|
else builtins.filter (h: h != thisConsensusServerHost) allConsensusServerHosts;
|
||||||
bind_addr = ip;
|
bind_addr = ip;
|
||||||
|
@ -104,13 +107,56 @@ in {
|
||||||
for m in machines:
|
for m in machines:
|
||||||
m.wait_for_unit("consul.service")
|
m.wait_for_unit("consul.service")
|
||||||
|
|
||||||
|
|
||||||
|
def wait_for_healthy_servers():
|
||||||
|
# See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
|
||||||
|
# for why the `Voter` column of `list-peers` has that info.
|
||||||
|
# TODO: The `grep true` relies on the fact that currently in
|
||||||
|
# the output like
|
||||||
|
# # consul operator raft list-peers
|
||||||
|
# Node ID Address State Voter RaftProtocol
|
||||||
|
# server3 ... 192.168.1.3:8300 leader true 3
|
||||||
|
# server2 ... 192.168.1.2:8300 follower true 3
|
||||||
|
# server1 ... 192.168.1.1:8300 follower false 3
|
||||||
|
# `Voter` is the only boolean column.
|
||||||
|
# Change this to the more reliable way to be defined by
|
||||||
|
# https://github.com/hashicorp/consul/issues/8118
|
||||||
|
# once that ticket is closed.
|
||||||
|
for m in machines:
|
||||||
|
m.wait_until_succeeds(
|
||||||
|
"[ $(consul operator raft list-peers | grep true | wc -l) == 3 ]"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def wait_for_all_machines_alive():
|
||||||
|
"""
|
||||||
|
Note that Serf-"alive" does not mean "Raft"-healthy;
|
||||||
|
see `wait_for_healthy_servers()` for that instead.
|
||||||
|
"""
|
||||||
for m in machines:
|
for m in machines:
|
||||||
m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]")
|
m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]")
|
||||||
|
|
||||||
|
|
||||||
|
wait_for_healthy_servers()
|
||||||
|
# Also wait for clients to be alive.
|
||||||
|
wait_for_all_machines_alive()
|
||||||
|
|
||||||
client1.succeed("consul kv put testkey 42")
|
client1.succeed("consul kv put testkey 42")
|
||||||
client2.succeed("[ $(consul kv get testkey) == 42 ]")
|
client2.succeed("[ $(consul kv get testkey) == 42 ]")
|
||||||
|
|
||||||
# Test that the cluster can tolerate failures of any single server:
|
|
||||||
|
def rolling_reboot_test(proper_rolling_procedure=True):
|
||||||
|
"""
|
||||||
|
Tests that the cluster can tolerate failures of any single server,
|
||||||
|
following the recommended rolling upgrade procedure from
|
||||||
|
https://www.consul.io/docs/upgrading#standard-upgrades.
|
||||||
|
|
||||||
|
Optionally, `proper_rolling_procedure=False` can be given
|
||||||
|
to wait only for each server to be back `Healthy`, not `Stable`
|
||||||
|
in the Raft consensus, see Consul setting `ServerStabilizationTime` and
|
||||||
|
https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040.
|
||||||
|
"""
|
||||||
|
|
||||||
for server in servers:
|
for server in servers:
|
||||||
server.crash()
|
server.crash()
|
||||||
|
|
||||||
|
@ -127,9 +173,12 @@ in {
|
||||||
# Restart crashed machine.
|
# Restart crashed machine.
|
||||||
server.start()
|
server.start()
|
||||||
|
|
||||||
|
if proper_rolling_procedure:
|
||||||
# Wait for recovery.
|
# Wait for recovery.
|
||||||
for m in machines:
|
wait_for_healthy_servers()
|
||||||
m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]")
|
else:
|
||||||
|
# NOT proper rolling upgrade procedure, see above.
|
||||||
|
wait_for_all_machines_alive()
|
||||||
|
|
||||||
# Wait for client connections.
|
# Wait for client connections.
|
||||||
client1.wait_until_succeeds("consul kv get -recurse")
|
client1.wait_until_succeeds("consul kv get -recurse")
|
||||||
|
@ -139,5 +188,42 @@ in {
|
||||||
client1.succeed("consul kv put testkey 44")
|
client1.succeed("consul kv put testkey 44")
|
||||||
client2.succeed("[ $(consul kv get testkey) == 44 ]")
|
client2.succeed("[ $(consul kv get testkey) == 44 ]")
|
||||||
client2.succeed("consul kv delete testkey")
|
client2.succeed("consul kv delete testkey")
|
||||||
|
|
||||||
|
|
||||||
|
def all_servers_crash_simultaneously_test():
|
||||||
|
"""
|
||||||
|
Tests that the cluster will eventually come back after all
|
||||||
|
servers crash simultaneously.
|
||||||
|
"""
|
||||||
|
|
||||||
|
for server in servers:
|
||||||
|
server.crash()
|
||||||
|
|
||||||
|
for server in servers:
|
||||||
|
server.start()
|
||||||
|
|
||||||
|
# Wait for recovery.
|
||||||
|
wait_for_healthy_servers()
|
||||||
|
|
||||||
|
# Wait for client connections.
|
||||||
|
client1.wait_until_succeeds("consul kv get -recurse")
|
||||||
|
client2.wait_until_succeeds("consul kv get -recurse")
|
||||||
|
|
||||||
|
# Do some consul actions with servers back up.
|
||||||
|
client1.succeed("consul kv put testkey 44")
|
||||||
|
client2.succeed("[ $(consul kv get testkey) == 44 ]")
|
||||||
|
client2.succeed("consul kv delete testkey")
|
||||||
|
|
||||||
|
|
||||||
|
# Run the tests.
|
||||||
|
|
||||||
|
print("rolling_reboot_test()")
|
||||||
|
rolling_reboot_test()
|
||||||
|
|
||||||
|
print("all_servers_crash_simultaneously_test()")
|
||||||
|
all_servers_crash_simultaneously_test()
|
||||||
|
|
||||||
|
print("rolling_reboot_test(proper_rolling_procedure=False)")
|
||||||
|
rolling_reboot_test(proper_rolling_procedure=False)
|
||||||
'';
|
'';
|
||||||
})
|
})
|
||||||
|
|
Loading…
Reference in New Issue