import ./make-test-python.nix ({pkgs, lib, ...}:
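# NixOS VM test for a Consul cluster: three server nodes and two client
# agents, exercising rolling server reboots and a simultaneous crash of
# all servers. Typically run from a nixpkgs checkout, e.g.:
#   nix-build -A nixosTests.consul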

let
  # Settings for both servers and agents
  webUi = true;
  retry_interval = "1s";
  raft_multiplier = 1;

  defaultExtraConfig = {
    inherit retry_interval;
    performance = {
      inherit raft_multiplier;
    };
  };

  allConsensusServerHosts = [
    "192.168.1.1"
    "192.168.1.2"
    "192.168.1.3"
  ];

  allConsensusClientHosts = [
    "192.168.2.1"
    "192.168.2.2"
  ];

  firewallSettings = {
    # See https://www.consul.io/docs/install/ports.html
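    # 8300 = server RPC, 8301 = Serf LAN, 8302 = Serf WAN,
    # 8500 = HTTP API (and web UI), 8600 = DNS interface.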
    allowedTCPPorts = [ 8301 8302 8600 8500 8300 ];
    allowedUDPPorts = [ 8301 8302 8600 ];
  };

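  # Build a client (non-server) agent node; `index` picks this node's
  # static IP from allConsensusClientHosts.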
  client = index: { pkgs, ... }:
    let
      ip = builtins.elemAt allConsensusClientHosts index;
    in
      {
        environment.systemPackages = [ pkgs.consul ];

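        # Priority 0 (mkOverride 0) replaces the default address the test
        # framework assigns to eth1 with our fixed test IP.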
        networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
          { address = ip; prefixLength = 16; }
        ];
        networking.firewall = firewallSettings;

        services.consul = {
          enable = true;
          inherit webUi;
          extraConfig = defaultExtraConfig // {
            server = false;
            retry_join = allConsensusServerHosts;
            bind_addr = ip;
          };
        };
      };

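  # Build a server node; servers are identified by their IP in
  # allConsensusServerHosts.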
  server = index: { pkgs, ... }:
    let
      numConsensusServers = builtins.length allConsensusServerHosts;
      thisConsensusServerHost = builtins.elemAt allConsensusServerHosts index;
      ip = thisConsensusServerHost; # since we already use IPs to identify servers
    in
      {
        networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
          { address = ip; prefixLength = 16; }
        ];
        networking.firewall = firewallSettings;

        services.consul =
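          # Guard against index/host mismatches before generating the config.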
          assert builtins.elem thisConsensusServerHost allConsensusServerHosts;
          {
            enable = true;
            inherit webUi;
            extraConfig = defaultExtraConfig // {
              server = true;
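              # Delay Raft bootstrapping (leader election) until all
              # declared servers have joined the cluster.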
              bootstrap_expect = numConsensusServers;
              # Tell Consul that we never intend to drop below this many servers.
              # This ensures consensus is not permanently lost after a temporary
              # loss of quorum.
              # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
              autopilot.min_quorum = numConsensusServers;
              retry_join =
                # If there's only 1 node in the network, we allow self-join;
                # otherwise, the node must not try to join itself, and join only the other servers.
                # See https://github.com/hashicorp/consul/issues/2868
                if numConsensusServers == 1
                  then allConsensusServerHosts
                  else builtins.filter (h: h != thisConsensusServerHost) allConsensusServerHosts;
              bind_addr = ip;
            };
          };
      };
in {
  name = "consul";

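  # Three servers form the Raft quorum; two nodes run client agents only.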
  nodes = {
    server1 = server 0;
    server2 = server 1;
    server3 = server 2;

    client1 = client 0;
    client2 = client 1;
  };

  testScript = ''
    servers = [server1, server2, server3]
    machines = [server1, server2, server3, client1, client2]

    for m in machines:
        m.wait_for_unit("consul.service")


    def wait_for_healthy_servers():
        # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
        # for why the `Voter` column of `list-peers` has that info.
        # TODO: The `grep true` relies on the fact that currently,
        #       in output like
        #           # consul operator raft list-peers
        #           Node     ID   Address           State     Voter  RaftProtocol
        #           server3  ...  192.168.1.3:8300  leader    true   3
        #           server2  ...  192.168.1.2:8300  follower  true   3
        #           server1  ...  192.168.1.1:8300  follower  false  3
        #       `Voter` is the only boolean column.
        #       Change this to the more reliable approach to be defined by
        #       https://github.com/hashicorp/consul/issues/8118
        #       once that ticket is closed.
        for m in machines:
            m.wait_until_succeeds(
                "[ $(consul operator raft list-peers | grep true | wc -l) == 3 ]"
            )


    def wait_for_all_machines_alive():
        """
        Note that Serf-"alive" does not mean "Raft"-healthy;
        see `wait_for_healthy_servers()` for that instead.
        """
        for m in machines:
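            # 5 = 3 servers + 2 clients.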
            m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]")


    wait_for_healthy_servers()
    # Also wait for clients to be alive.
    wait_for_all_machines_alive()

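    # Basic sanity check: a key written through one client must be readable
    # through the other.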
    client1.succeed("consul kv put testkey 42")
    client2.succeed("[ $(consul kv get testkey) == 42 ]")


    def rolling_reboot_test(proper_rolling_procedure=True):
        """
        Tests that the cluster can tolerate failures of any single server,
        following the recommended rolling upgrade procedure from
        https://www.consul.io/docs/upgrading#standard-upgrades.

        Optionally, `proper_rolling_procedure=False` can be given
        to wait only for each server to be back `Healthy`, not `Stable`
        in the Raft consensus; see the Consul setting `ServerStabilizationTime` and
        https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040.
        """

        for server in servers:
            server.crash()

            # For each client, wait until it has a connection again
            # (using `kv get -recurse`) before issuing commands.
            client1.wait_until_succeeds("consul kv get -recurse")
            client2.wait_until_succeeds("consul kv get -recurse")

            # Do some consul actions while one server is down.
            client1.succeed("consul kv put testkey 43")
            client2.succeed("[ $(consul kv get testkey) == 43 ]")
            client2.succeed("consul kv delete testkey")

            # Restart the crashed machine.
            server.start()

            if proper_rolling_procedure:
                # Wait for recovery.
                wait_for_healthy_servers()
            else:
                # NOT the proper rolling upgrade procedure, see above.
                wait_for_all_machines_alive()

            # Wait for client connections.
            client1.wait_until_succeeds("consul kv get -recurse")
            client2.wait_until_succeeds("consul kv get -recurse")

            # Do some consul actions with the server back up.
            client1.succeed("consul kv put testkey 44")
            client2.succeed("[ $(consul kv get testkey) == 44 ]")
            client2.succeed("consul kv delete testkey")


    def all_servers_crash_simultaneously_test():
        """
        Tests that the cluster will eventually come back after all
        servers crash simultaneously.
        """

        for server in servers:
            server.crash()

        for server in servers:
            server.start()

        # Wait for recovery.
        wait_for_healthy_servers()

        # Wait for client connections.
        client1.wait_until_succeeds("consul kv get -recurse")
        client2.wait_until_succeeds("consul kv get -recurse")

        # Do some consul actions with the servers back up.
        client1.succeed("consul kv put testkey 44")
        client2.succeed("[ $(consul kv get testkey) == 44 ]")
        client2.succeed("consul kv delete testkey")


    # Run the tests.

    print("rolling_reboot_test()")
    rolling_reboot_test()

    print("all_servers_crash_simultaneously_test()")
    all_servers_crash_simultaneously_test()

    print("rolling_reboot_test(proper_rolling_procedure=False)")
    rolling_reboot_test(proper_rolling_procedure=False)
  '';
})
