import ./make-test-python.nix ({pkgs, lib, ...}:

let
  # Settings for both servers and agents
  retry_interval = "1s";
  raft_multiplier = 1;

  defaultExtraConfig = {
    inherit retry_interval;
    performance = {
      inherit raft_multiplier;
    };
  };

  allConsensusServerHosts = [
    "192.168.1.1"
    "192.168.1.2"
    "192.168.1.3"
  ];

  allConsensusClientHosts = [
    "192.168.2.1"
    "192.168.2.2"
  ];

  firewallSettings = {
    # See https://www.consul.io/docs/install/ports.html
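    # 8300 = server RPC, 8301 = Serf LAN, 8302 = Serf WAN,
    # 8500 = HTTP API, 8600 = DNS. Serf and DNS also use UDP.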
    allowedTCPPorts = [ 8300 8301 8302 8500 8600 ];
    allowedUDPPorts = [ 8301 8302 8600 ];
  };

  client = index: { pkgs, ... }:
    let
      ip = builtins.elemAt allConsensusClientHosts index;
    in
    {
      environment.systemPackages = [ pkgs.consul ];

      networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
        { address = ip; prefixLength = 16; }
      ];
      networking.firewall = firewallSettings;

      services.consul = {
        enable = true;
        extraConfig = defaultExtraConfig // {
          retry_join = allConsensusServerHosts;
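          # Unlike the servers below, the clients need no bootstrap settings;
          # they simply keep retrying to join all known servers.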
        };
      };
    };

  server = index: { pkgs, ... }:
    let
      numConsensusServers = builtins.length allConsensusServerHosts;
      thisConsensusServerHost = builtins.elemAt allConsensusServerHosts index;
      ip = thisConsensusServerHost; # since we already use IPs to identify servers
    in
    {
      networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
        { address = ip; prefixLength = 16; }
      ];
      networking.firewall = firewallSettings;

      services.consul =
        assert builtins.elem thisConsensusServerHost allConsensusServerHosts;
        {
          enable = true;
          extraConfig = defaultExtraConfig // {
            server = true;
            bootstrap_expect = numConsensusServers;
            # Tell Consul that we never intend to drop below this many servers.
            # This ensures that consensus is not permanently lost after a
            # temporary loss of quorum.
            # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
            autopilot.min_quorum = numConsensusServers;
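            # With the three servers defined above, both settings evaluate to 3:
            # each server waits until 3 servers have joined before bootstrapping
            # the Raft cluster, and autopilot never prunes the voter set below 3.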
            retry_join =
              # If there's only 1 node in the network, we allow self-join;
              # otherwise, a node must not try to join itself, and joins
              # only the other servers.
              # See https://github.com/hashicorp/consul/issues/2868
              if numConsensusServers == 1
              then allConsensusServerHosts
              else builtins.filter (h: h != thisConsensusServerHost) allConsensusServerHosts;
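            # E.g. for the server at 192.168.1.1 this evaluates to
            # [ "192.168.1.2" "192.168.1.3" ].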
          };
        };
    };

in {
  name = "consul";

  nodes = {
    server1 = server 0;
    server2 = server 1;
    server3 = server 2;

    client1 = client 0;
    client2 = client 1;
  };

  testScript = ''
    servers = [server1, server2, server3]
    machines = [server1, server2, server3, client1, client2]

    for m in machines:
        m.wait_for_unit("consul.service")


    def wait_for_healthy_servers():
        # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
        # for why the `Voter` column of `list-peers` has that info.
        # TODO: The `grep true` below relies on the fact that in output like
        #       # consul operator raft list-peers
        #       Node     ID   Address           State     Voter  RaftProtocol
        #       server3  ...  192.168.1.3:8300  leader    true   3
        #       server2  ...  192.168.1.2:8300  follower  true   3
        #       server1  ...  192.168.1.1:8300  follower  false  3
        #       `Voter` is the only boolean column.
        #       Change this to the more reliable method to be defined by
        #       https://github.com/hashicorp/consul/issues/8118
        #       once that ticket is closed.
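        # With the three servers defined above, a healthy cluster has exactly
        # 3 voters.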
        for m in machines:
            m.wait_until_succeeds(
                "[ $(consul operator raft list-peers | grep true | wc -l) == 3 ]"
            )


    def wait_for_all_machines_alive():
        """
        Note that Serf-"alive" does not mean "Raft"-healthy;
        see `wait_for_healthy_servers()` for that instead.
        """
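        # All 5 machines (3 servers + 2 clients) must show up as alive in
        # `consul members` on every machine.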
        for m in machines:
            m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]")


    wait_for_healthy_servers()
    # Also wait for clients to be alive.
    wait_for_all_machines_alive()

    client1.succeed("consul kv put testkey 42")
    client2.succeed("[ $(consul kv get testkey) == 42 ]")
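    # A value written via one client agent and read back via the other
    # demonstrates that both agents talk to the same replicated KV store.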


    def rolling_reboot_test(proper_rolling_procedure=True):
        """
        Tests that the cluster can tolerate failures of any single server,
        following the recommended rolling upgrade procedure from
        https://www.consul.io/docs/upgrading#standard-upgrades.

        Optionally, `proper_rolling_procedure=False` can be given
        to wait only for each server to be back `Healthy`, not `Stable`
        in the Raft consensus; see the Consul setting `ServerStabilizationTime`
        and https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040.
        """
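        # Waiting for servers to be Raft-`Stable` before crashing the next one
        # ensures the restarted server has rejoined the voter set; otherwise
        # two of the three voters could be absent at once, losing quorum.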

        for server in servers:
            server.crash()

            # For each client, wait until it has a connection again
            # (using `kv get -recurse`) before issuing commands.
            client1.wait_until_succeeds("consul kv get -recurse")
            client2.wait_until_succeeds("consul kv get -recurse")

            # Do some consul actions while one server is down.
            client1.succeed("consul kv put testkey 43")
            client2.succeed("[ $(consul kv get testkey) == 43 ]")
            client2.succeed("consul kv delete testkey")

            # Restart the crashed machine.
            server.start()

            if proper_rolling_procedure:
                # Wait for recovery.
                wait_for_healthy_servers()
            else:
                # NOT the proper rolling upgrade procedure; see above.
                wait_for_all_machines_alive()

            # Wait for client connections.
            client1.wait_until_succeeds("consul kv get -recurse")
            client2.wait_until_succeeds("consul kv get -recurse")

            # Do some consul actions with the server back up.
            client1.succeed("consul kv put testkey 44")
            client2.succeed("[ $(consul kv get testkey) == 44 ]")
            client2.succeed("consul kv delete testkey")


    def all_servers_crash_simultaneously_test():
        """
        Tests that the cluster will eventually come back after all
        servers crash simultaneously.
        """
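        # Because `autopilot.min_quorum` keeps all three servers in the voter
        # set, the cluster can re-elect a leader once the servers rejoin,
        # rather than permanently losing consensus.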

        for server in servers:
            server.crash()

        for server in servers:
            server.start()

        # Wait for recovery.
        wait_for_healthy_servers()

        # Wait for client connections.
        client1.wait_until_succeeds("consul kv get -recurse")
        client2.wait_until_succeeds("consul kv get -recurse")

        # Do some consul actions with the servers back up.
        client1.succeed("consul kv put testkey 44")
        client2.succeed("[ $(consul kv get testkey) == 44 ]")
        client2.succeed("consul kv delete testkey")


    print("rolling_reboot_test()")
    rolling_reboot_test()

    print("all_servers_crash_simultaneously_test()")
    all_servers_crash_simultaneously_test()

    print("rolling_reboot_test(proper_rolling_procedure=False)")
    rolling_reboot_test(proper_rolling_procedure=False)
  '';
})