1 import ./make-test-python.nix ({pkgs, lib, ...}:
# NixOS VM test for a Consul cluster. The settings below are shared by
# both server and client (agent) nodes.
# NOTE(review): intervening lines are elided in this excerpt — the
# bindings for retry_interval / raft_multiplier and the host-list
# entries are not visible here.
4 # Settings for both servers and agents
10 inherit retry_interval;
12 inherit raft_multiplier;
# The IP addresses below double as stable node identities: servers are
# identified by their IP (see the `server` builder further down).
16 allConsensusServerHosts = [
22 allConsensusClientHosts = [
# Ports opened on every node, per the linked Consul ports reference
# (server RPC 8300, Serf LAN/WAN gossip 8301/8302, DNS 8600, HTTP API 8500).
28 # See https://www.consul.io/docs/install/ports.html
29 allowedTCPPorts = [ 8301 8302 8600 8500 8300 ];
30 allowedUDPPorts = [ 8301 8302 8600 ];
# Builds the NixOS configuration for client (agent) node number `index`.
# Each client gets a static IP taken from `allConsensusClientHosts`.
33 client = index: { pkgs, ... }:
35 ip = builtins.elemAt allConsensusClientHosts index;
38 environment.systemPackages = [ pkgs.consul ];
# Force our static address onto eth1 (mkOverride 0 beats the test
# framework's default address assignment).
40 networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
41 { address = ip; prefixLength = 16; }
43 networking.firewall = firewallSettings;
# Consul switched to a non-free license upstream; allow just this package.
45 nixpkgs.config.allowUnfreePredicate = pkg: builtins.elem (lib.getName pkg) [ "consul" ];
# Agents retry-join every server so they find the cluster regardless of
# which servers are currently up.
50 extraConfig = defaultExtraConfig // {
52 retry_join = allConsensusServerHosts;
# Builds the NixOS configuration for consensus server number `index`.
# Servers are identified by their IP from `allConsensusServerHosts`.
58 server = index: { pkgs, ... }:
60 numConsensusServers = builtins.length allConsensusServerHosts;
61 thisConsensusServerHost = builtins.elemAt allConsensusServerHosts index;
62 ip = thisConsensusServerHost; # since we already use IPs to identify servers
# Force our static address onto eth1 (mkOverride 0 beats the test
# framework's default address assignment).
65 networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
66 { address = ip; prefixLength = 16; }
68 networking.firewall = firewallSettings;
# Consul switched to a non-free license upstream; allow just this package.
70 nixpkgs.config.allowUnfreePredicate = pkg: builtins.elem (lib.getName pkg) [ "consul" ];
# Sanity check: an out-of-range `index` would yield a host not in the list.
73 assert builtins.elem thisConsensusServerHost allConsensusServerHosts;
77 extraConfig = defaultExtraConfig // {
# Wait for the full server set before bootstrapping the Raft cluster.
79 bootstrap_expect = numConsensusServers;
80 # Tell Consul that we never intend to drop below this many servers.
81 # Ensures we do not permanently lose consensus after a temporary loss.
82 # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
83 autopilot.min_quorum = numConsensusServers;
85 # If there's only 1 node in the network, we allow self-join;
86 # otherwise, the node must not try to join itself, and join only the other servers.
87 # See https://github.com/hashicorp/consul/issues/2868
88 if numConsensusServers == 1
89 then allConsensusServerHosts
90 else builtins.filter (h: h != thisConsensusServerHost) allConsensusServerHosts;
# Convenience groupings of the test VMs defined by the node set above.
108 servers = [server1, server2, server3]
109 machines = [server1, server2, server3, client1, client2]
# Wait until the consul systemd unit is up on each machine.
# NOTE(review): the `for` line binding `m` (presumably `for m in machines:`)
# is elided from this excerpt — confirm against the full file.
112 m.wait_for_unit("consul.service")
115 def wait_for_healthy_servers():
# Raft-level health check: block until all 3 servers show up as `Voter`
# in `consul operator raft list-peers` output. Serf "alive" alone is not
# sufficient — see wait_for_all_machines_alive() for the weaker check.
116 # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
117 # for why the `Voter` column of `list-peers` has that info.
118 # TODO: The `grep true` relies on the fact that currently in
120 # # consul operator raft list-peers
121 # Node ID Address State Voter RaftProtocol
122 # server3 ... 192.168.1.3:8300 leader true 3
123 # server2 ... 192.168.1.2:8300 follower true 3
124 # server1 ... 192.168.1.1:8300 follower false 3
125 # `Voter` is the only boolean column.
126 # Change this to the more reliable way to be defined by
127 # https://github.com/hashicorp/consul/issues/8118
128 # once that ticket is closed.
# NOTE(review): the loop header binding `m` (presumably `for m in machines:`)
# and the call's closing parenthesis are elided from this excerpt.
# The hard-coded 3 matches len(servers).
130 m.wait_until_succeeds(
131 "[ $(consul operator raft list-peers | grep true | wc -l) == 3 ]"
135 def wait_for_all_machines_alive():
# Serf-level liveness check: block until `consul members` reports all
# machines (servers AND clients) as "alive".
# NOTE(review): the docstring delimiters around the next two lines are
# elided from this excerpt.
137 Note that Serf-"alive" does not mean "Raft"-healthy;
138 see `wait_for_healthy_servers()` for that instead.
# NOTE(review): the loop header binding `m` (presumably `for m in machines:`)
# is elided. The hard-coded 5 matches len(machines).
141 m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]")
# Initial cluster sanity: servers must reach Raft health first.
144 wait_for_healthy_servers()
145 # Also wait for clients to be alive.
146 wait_for_all_machines_alive()
# Smoke test: a KV write issued on one client must be readable from the
# other, proving both agents are wired into the same cluster.
148 client1.succeed("consul kv put testkey 42")
149 client2.succeed("[ $(consul kv get testkey) == 42 ]")
152 def rolling_restart_test(proper_rolling_procedure=True):
154 Tests that the cluster can tolerate failures of any single server,
155 following the recommended rolling upgrade procedure from
156 https://www.consul.io/docs/upgrading#standard-upgrades.
158 Optionally, `proper_rolling_procedure=False` can be given
159 to wait only for each server to be back `Healthy`, not `Stable`
160 in the Raft consensus, see Consul setting `ServerStabilizationTime` and
161 https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040.
# Take each server down in turn, exercise the cluster while it is down,
# then bring it back and wait for recovery before moving on.
164 for server in servers:
166 server.systemctl("stop consul")
168 # Make sure the stopped peer is recognized as being down
169 client1.wait_until_succeeds(
170 f"[ $(consul members | grep {server.name} | grep -o -E 'failed|left' | wc -l) == 1 ]"
173 # For each client, wait until they have connection again
174 # using `kv get -recurse` before issuing commands.
175 client1.wait_until_succeeds("consul kv get -recurse")
176 client2.wait_until_succeeds("consul kv get -recurse")
178 # Do some consul actions while one server is down.
179 client1.succeed("consul kv put testkey 43")
180 client2.succeed("[ $(consul kv get testkey) == 43 ]")
181 client2.succeed("consul kv delete testkey")
# Bring the stopped server back before touching the next one.
184 server.systemctl("start consul")
186 if proper_rolling_procedure:
# Proper procedure: wait for full Raft health (all servers Voters)
# before restarting the next server.
188 wait_for_healthy_servers()
190 # NOT proper rolling upgrade procedure, see above.
191 wait_for_all_machines_alive()
193 # Wait for client connections.
194 client1.wait_until_succeeds("consul kv get -recurse")
195 client2.wait_until_succeeds("consul kv get -recurse")
197 # Do some consul actions with server back up.
198 client1.succeed("consul kv put testkey 44")
199 client2.succeed("[ $(consul kv get testkey) == 44 ]")
200 client2.succeed("consul kv delete testkey")
203 def all_servers_crash_simultaneously_test():
205 Tests that the cluster will eventually come back after all
206 servers crash simultaneously.
# Stop all servers asynchronously so the outage is (near-)simultaneous
# rather than one-at-a-time.
209 for server in servers:
211 server.systemctl("stop --no-block consul")
213 for server in servers:
214 # --no-block is async, so ensure it has been stopped by now
215 server.wait_until_fails("systemctl is-active --quiet consul")
# Restart every server and wait for the cluster to re-form.
217 server.systemctl("start consul")
# With autopilot.min_quorum set (see server config above), consensus
# should recover rather than being permanently lost.
220 wait_for_healthy_servers()
222 # Wait for client connections.
223 client1.wait_until_succeeds("consul kv get -recurse")
224 client2.wait_until_succeeds("consul kv get -recurse")
226 # Do some consul actions with servers back up.
227 client1.succeed("consul kv put testkey 44")
228 client2.succeed("[ $(consul kv get testkey) == 44 ]")
229 client2.succeed("consul kv delete testkey")
# Run the scenarios. Each is announced with a print so test-log output
# can be matched to the scenario that produced it.
234 print("rolling_restart_test()")
235 rolling_restart_test()
237 print("all_servers_crash_simultaneously_test()")
238 all_servers_crash_simultaneously_test()
# Repeat the rolling restart using only the weaker Serf-alive wait,
# to check the cluster also survives the improper procedure.
240 print("rolling_restart_test(proper_rolling_procedure=False)")
241 rolling_restart_test(proper_rolling_procedure=False)