import ./make-test-python.nix ({pkgs, lib, ...}:

let
  # Settings for both servers and agents
  retry_interval = "1s";
  raft_multiplier = 1;

  defaultExtraConfig = {
    inherit retry_interval;
    performance = {
      inherit raft_multiplier;
    };
  };

  allConsensusServerHosts = [
    "192.168.1.1"
    "192.168.1.2"
    "192.168.1.3"
  ];

  allConsensusClientHosts = [
    "192.168.2.1"
    "192.168.2.2"
  ];

  firewallSettings = {
    # See https://www.consul.io/docs/install/ports.html
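    # 8300 = server RPC, 8301 = Serf LAN, 8302 = Serf WAN,
    # 8500 = HTTP API, 8600 = DNS. Serf and DNS also use UDP.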
    allowedTCPPorts = [ 8300 8301 8302 8500 8600 ];
    allowedUDPPorts = [ 8301 8302 8600 ];
  };

  client = index: { pkgs, ... }:
    let
      ip = builtins.elemAt allConsensusClientHosts index;
    in
    {
      environment.systemPackages = [ pkgs.consul ];

      networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
        { address = ip; prefixLength = 16; }
      ];
      networking.firewall = firewallSettings;

      services.consul = {
        enable = true;
        extraConfig = defaultExtraConfig // {
          retry_join = allConsensusServerHosts;
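          # Unlike the servers below, the clients need no bootstrap settings;
          # they simply keep retrying to join all known servers.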
        };
      };
    };

  server = index: { pkgs, ... }:
    let
      numConsensusServers = builtins.length allConsensusServerHosts;
      thisConsensusServerHost = builtins.elemAt allConsensusServerHosts index;
      ip = thisConsensusServerHost; # since we already use IPs to identify servers
    in
    {
      networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
        { address = ip; prefixLength = 16; }
      ];
      networking.firewall = firewallSettings;

      services.consul =
        assert builtins.elem thisConsensusServerHost allConsensusServerHosts;
        {
          enable = true;
          extraConfig = defaultExtraConfig // {
            server = true;
            bootstrap_expect = numConsensusServers;
            # Tell Consul that we never intend to drop below this many servers.
            # This ensures that consensus is not permanently lost after a
            # temporary loss of quorum.
            # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
            autopilot.min_quorum = numConsensusServers;
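            # With the three servers defined above, both settings evaluate to 3:
            # each server waits until 3 servers have joined before bootstrapping
            # the Raft cluster, and autopilot never prunes the voter set below 3.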
            retry_join =
              # If there's only 1 node in the network, we allow self-join;
              # otherwise, a node must not try to join itself, and joins
              # only the other servers.
              # See https://github.com/hashicorp/consul/issues/2868
              if numConsensusServers == 1
              then allConsensusServerHosts
              else builtins.filter (h: h != thisConsensusServerHost) allConsensusServerHosts;
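            # E.g. for the server at 192.168.1.1 this evaluates to
            # [ "192.168.1.2" "192.168.1.3" ].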
          };
        };
    };

in {
  name = "consul";

  nodes = {
    server1 = server 0;
    server2 = server 1;
    server3 = server 2;

    client1 = client 0;
    client2 = client 1;
  };

  testScript = ''
    servers = [server1, server2, server3]
    machines = [server1, server2, server3, client1, client2]

    for m in machines:
        m.wait_for_unit("consul.service")


    def wait_for_healthy_servers():
        # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
        # for why the `Voter` column of `list-peers` has that info.
        # TODO: The `grep true` below relies on the fact that in output like
        #       # consul operator raft list-peers
        #       Node     ID   Address           State     Voter  RaftProtocol
        #       server3  ...  192.168.1.3:8300  leader    true   3
        #       server2  ...  192.168.1.2:8300  follower  true   3
        #       server1  ...  192.168.1.1:8300  follower  false  3
        #       `Voter` is the only boolean column.
        #       Change this to the more reliable method to be defined by
        #       https://github.com/hashicorp/consul/issues/8118
        #       once that ticket is closed.
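        # With the three servers defined above, a healthy cluster has exactly
        # 3 voters.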
        for m in machines:
            m.wait_until_succeeds(
                "[ $(consul operator raft list-peers | grep true | wc -l) == 3 ]"
            )


    def wait_for_all_machines_alive():
        """
        Note that Serf-"alive" does not mean "Raft"-healthy;
        see `wait_for_healthy_servers()` for that instead.
        """
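        # All 5 machines (3 servers + 2 clients) must show up as alive in
        # `consul members` on every machine.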
        for m in machines:
            m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]")


    wait_for_healthy_servers()
    # Also wait for clients to be alive.
    wait_for_all_machines_alive()

    client1.succeed("consul kv put testkey 42")
    client2.succeed("[ $(consul kv get testkey) == 42 ]")
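    # A value written via one client agent and read back via the other
    # demonstrates that both agents talk to the same replicated KV store.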


    def rolling_reboot_test(proper_rolling_procedure=True):
        """
        Tests that the cluster can tolerate failures of any single server,
        following the recommended rolling upgrade procedure from
        https://www.consul.io/docs/upgrading#standard-upgrades.

        Optionally, `proper_rolling_procedure=False` can be given
        to wait only for each server to be back `Healthy`, not `Stable`
        in the Raft consensus; see the Consul setting `ServerStabilizationTime`
        and https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040.
        """
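        # Waiting for servers to be Raft-`Stable` before crashing the next one
        # ensures the restarted server has rejoined the voter set; otherwise
        # two of the three voters could be absent at once, losing quorum.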

        for server in servers:
            server.crash()

            # For each client, wait until it has a connection again
            # (using `kv get -recurse`) before issuing commands.
            client1.wait_until_succeeds("consul kv get -recurse")
            client2.wait_until_succeeds("consul kv get -recurse")

            # Do some consul actions while one server is down.
            client1.succeed("consul kv put testkey 43")
            client2.succeed("[ $(consul kv get testkey) == 43 ]")
            client2.succeed("consul kv delete testkey")

            # Restart the crashed machine.
            server.start()

            if proper_rolling_procedure:
                # Wait for recovery.
                wait_for_healthy_servers()
            else:
                # NOT the proper rolling upgrade procedure; see above.
                wait_for_all_machines_alive()

            # Wait for client connections.
            client1.wait_until_succeeds("consul kv get -recurse")
            client2.wait_until_succeeds("consul kv get -recurse")

            # Do some consul actions with the server back up.
            client1.succeed("consul kv put testkey 44")
            client2.succeed("[ $(consul kv get testkey) == 44 ]")
            client2.succeed("consul kv delete testkey")


    def all_servers_crash_simultaneously_test():
        """
        Tests that the cluster will eventually come back after all
        servers crash simultaneously.
        """
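        # Because `autopilot.min_quorum` keeps all three servers in the voter
        # set, the cluster can re-elect a leader once the servers rejoin,
        # rather than permanently losing consensus.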

        for server in servers:
            server.crash()

        for server in servers:
            server.start()

        # Wait for recovery.
        wait_for_healthy_servers()

        # Wait for client connections.
        client1.wait_until_succeeds("consul kv get -recurse")
        client2.wait_until_succeeds("consul kv get -recurse")

        # Do some consul actions with the servers back up.
        client1.succeed("consul kv put testkey 44")
        client2.succeed("[ $(consul kv get testkey) == 44 ]")
        client2.succeed("consul kv delete testkey")


    print("rolling_reboot_test()")
    rolling_reboot_test()

    print("all_servers_crash_simultaneously_test()")
    all_servers_crash_simultaneously_test()

    print("rolling_reboot_test(proper_rolling_procedure=False)")
    rolling_reboot_test(proper_rolling_procedure=False)
  '';
})