import ./make-test-python.nix (
  # Settings for both servers and agents
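  # A minimal sketch of the shared timing settings (assumed values,
  # chosen low so the VM test converges quickly):
  retry_interval = "1s";
  raft_multiplier = 1;
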
  defaultExtraConfig = {
    inherit retry_interval;
    inherit raft_multiplier;
  };
  allConsensusServerHosts = [
    "192.168.1.1"
    "192.168.1.2"
    "192.168.1.3"
  ];
  allConsensusClientHosts = [
    # (assumed addresses for the two client machines)
    "192.168.2.1"
    "192.168.2.2"
  ];
  # See https://www.consul.io/docs/install/ports.html
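  firewallSettings = {
    # A sketch assuming the standard Consul port assignments from the
    # page above: 8300 server RPC, 8301/8302 Serf LAN/WAN, 8500 HTTP
    # API, 8600 DNS.
    allowedTCPPorts = [ 8300 8301 8302 8500 8600 ];
    allowedUDPPorts = [ 8301 8302 8600 ];
  };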

  ip = builtins.elemAt allConsensusClientHosts index;
  environment.systemPackages = [ pkgs.consul ];

  networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
    { address = ip; prefixLength = 16; }
  ];

  networking.firewall = firewallSettings;

  # Consul is distributed under the non-free BUSL license, so it has to
  # be allowed explicitly.
  nixpkgs.config.allowUnfreePredicate = pkg: builtins.elem (lib.getName pkg) [ "consul" ];

  extraConfig = defaultExtraConfig // {
    retry_join = allConsensusServerHosts;
  };
  numConsensusServers = builtins.length allConsensusServerHosts;
  thisConsensusServerHost = builtins.elemAt allConsensusServerHosts index;
  ip = thisConsensusServerHost; # since we already use IPs to identify servers
  networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
    { address = ip; prefixLength = 16; }
  ];

  networking.firewall = firewallSettings;

  nixpkgs.config.allowUnfreePredicate = pkg: builtins.elem (lib.getName pkg) [ "consul" ];
  assert builtins.elem thisConsensusServerHost allConsensusServerHosts;
  extraConfig = defaultExtraConfig // {
    bootstrap_expect = numConsensusServers;

    # Tell Consul that we never intend to drop below this many servers.
    # This ensures that consensus is not permanently lost after a
    # temporary loss.
    # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
    autopilot.min_quorum = numConsensusServers;
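    # (With three servers, a bare Raft majority would be 2, so without
    # this setting autopilot's dead-server cleanup could shrink the
    # voter set to 2, after which one more failure loses consensus for
    # good; see the linked issue comment.)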
    # If there's only 1 node in the network, we allow self-join;
    # otherwise, the node must not try to join itself, and must join
    # only the other servers.
    # See https://github.com/hashicorp/consul/issues/2868
    retry_join =
      if numConsensusServers == 1 then
        allConsensusServerHosts
      else
        builtins.filter (h: h != thisConsensusServerHost) allConsensusServerHosts;
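    # For example (illustrative): with the three server addresses above,
    # the node at 192.168.1.2 gets
    # retry_join = [ "192.168.1.1" "192.168.1.3" ].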

servers = [server1, server2, server3]
machines = [server1, server2, server3, client1, client2]

for m in machines:
    m.wait_for_unit("consul.service")

def wait_for_healthy_servers():
    # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
    # for why the `Voter` column of `list-peers` has that info.
    # TODO: The `grep true` relies on the fact that currently, in output like
    #
    #     # consul operator raft list-peers
    #     Node     ID   Address           State     Voter  RaftProtocol
    #     server3  ...  192.168.1.3:8300  leader    true   3
    #     server2  ...  192.168.1.2:8300  follower  true   3
    #     server1  ...  192.168.1.1:8300  follower  false  3
    #
    #     `Voter` is the only boolean column.
    #     Change this to the more reliable check that
    #     https://github.com/hashicorp/consul/issues/8118
    #     will define, once that ticket is closed.
    for m in machines:
        m.wait_until_succeeds(
            "[ $(consul operator raft list-peers | grep true | wc -l) == 3 ]"
        )

def wait_for_all_machines_alive():
    """
    Note that Serf-"alive" does not mean "Raft"-healthy;
    see `wait_for_healthy_servers()` for that instead.
    """
    for m in machines:
        # 5 = 3 servers + 2 clients
        m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]")

wait_for_healthy_servers()
# Also wait for clients to be alive.
wait_for_all_machines_alive()

client1.succeed("consul kv put testkey 42")
client2.succeed("[ $(consul kv get testkey) == 42 ]")

def rolling_restart_test(proper_rolling_procedure=True):
    """
    Tests that the cluster can tolerate failures of any single server,
    following the recommended rolling upgrade procedure from
    https://www.consul.io/docs/upgrading#standard-upgrades.

    Optionally, `proper_rolling_procedure=False` can be given
    to wait only for each server to be back `Healthy`, not `Stable`
    in the Raft consensus; see Consul setting `ServerStabilizationTime` and
    https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040.
    """
    for server in servers:
        server.systemctl("stop consul")

        # Make sure the stopped peer is recognized as being down.
        client1.wait_until_succeeds(
            f"[ $(consul members | grep {server.name} | grep -o -E 'failed|left' | wc -l) == 1 ]"
        )

        # For each client, wait until it has a connection again
        # using `kv get -recurse` before issuing commands.
        client1.wait_until_succeeds("consul kv get -recurse")
        client2.wait_until_succeeds("consul kv get -recurse")

        # Do some Consul actions while one server is down.
        client1.succeed("consul kv put testkey 43")
        client2.succeed("[ $(consul kv get testkey) == 43 ]")
        client2.succeed("consul kv delete testkey")

        server.systemctl("start consul")

        if proper_rolling_procedure:
            # Wait for the restarted server to be fully back in the
            # Raft consensus (`Stable`) before touching the next one.
            wait_for_healthy_servers()
        else:
            # NOT the proper rolling upgrade procedure; see above.
            wait_for_all_machines_alive()

        # Wait for client connections.
        client1.wait_until_succeeds("consul kv get -recurse")
        client2.wait_until_succeeds("consul kv get -recurse")

        # Do some Consul actions with the server back up.
        client1.succeed("consul kv put testkey 44")
        client2.succeed("[ $(consul kv get testkey) == 44 ]")
        client2.succeed("consul kv delete testkey")

def all_servers_crash_simultaneously_test():
    """
    Tests that the cluster will eventually come back after all
    servers crash simultaneously.
    """
    for server in servers:
        server.systemctl("stop --no-block consul")

    for server in servers:
        # --no-block is asynchronous, so make sure the service has
        # actually stopped before restarting it.
        server.wait_until_fails("systemctl is-active --quiet consul")

        server.systemctl("start consul")

    wait_for_healthy_servers()

    # Wait for client connections.
    client1.wait_until_succeeds("consul kv get -recurse")
    client2.wait_until_succeeds("consul kv get -recurse")

    # Do some Consul actions with the servers back up.
    client1.succeed("consul kv put testkey 44")
    client2.succeed("[ $(consul kv get testkey) == 44 ]")
    client2.succeed("consul kv delete testkey")

print("rolling_restart_test()")
rolling_restart_test()

print("all_servers_crash_simultaneously_test()")
all_servers_crash_simultaneously_test()

print("rolling_restart_test(proper_rolling_procedure=False)")
rolling_restart_test(proper_rolling_procedure=False)