1 import ./make-test-python.nix ({pkgs, lib, ...}:
# NixOS VM test for a Consul cluster. The settings below are shared by
# both server and client (agent) nodes.
# NOTE(review): intervening lines are elided in this excerpt — the
# bindings for retry_interval / raft_multiplier and the host-list
# entries are not visible here.
4 # Settings for both servers and agents
10 inherit retry_interval;
12 inherit raft_multiplier;
# The IP addresses below double as stable node identities: servers are
# identified by their IP (see the `server` builder further down).
16 allConsensusServerHosts = [
22 allConsensusClientHosts = [
# Ports opened on every node, per the linked Consul ports reference
# (server RPC 8300, Serf LAN/WAN gossip 8301/8302, DNS 8600, HTTP API 8500).
28 # See https://www.consul.io/docs/install/ports.html
29 allowedTCPPorts = [ 8301 8302 8600 8500 8300 ];
30 allowedUDPPorts = [ 8301 8302 8600 ];
# Builds the NixOS configuration for client (agent) node number `index`.
# Each client gets a static IP taken from `allConsensusClientHosts`.
33 client = index: { pkgs, ... }:
35 ip = builtins.elemAt allConsensusClientHosts index;
38 environment.systemPackages = [ pkgs.consul ];
# Force our static address onto eth1 (mkOverride 0 beats the test
# framework's default address assignment).
40 networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
41 { address = ip; prefixLength = 16; }
43 networking.firewall = firewallSettings;
# Consul switched to a non-free license upstream; allow just this package.
45 nixpkgs.config.allowUnfreePredicate = pkg: builtins.elem (lib.getName pkg) [ "consul" ];
# Agents retry-join every server so they find the cluster regardless of
# which servers are currently up.
50 extraConfig = defaultExtraConfig // {
52 retry_join = allConsensusServerHosts;
# Builds the NixOS configuration for consensus server number `index`.
# Servers are identified by their IP from `allConsensusServerHosts`.
58 server = index: { pkgs, ... }:
60 numConsensusServers = builtins.length allConsensusServerHosts;
61 thisConsensusServerHost = builtins.elemAt allConsensusServerHosts index;
62 ip = thisConsensusServerHost; # since we already use IPs to identify servers
# Force our static address onto eth1 (mkOverride 0 beats the test
# framework's default address assignment).
65 networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
66 { address = ip; prefixLength = 16; }
68 networking.firewall = firewallSettings;
# Consul switched to a non-free license upstream; allow just this package.
70 nixpkgs.config.allowUnfreePredicate = pkg: builtins.elem (lib.getName pkg) [ "consul" ];
# Sanity check: an out-of-range `index` would yield a host not in the list.
73 assert builtins.elem thisConsensusServerHost allConsensusServerHosts;
77 extraConfig = defaultExtraConfig // {
# Wait for the full server set before bootstrapping the Raft cluster.
79 bootstrap_expect = numConsensusServers;
80 # Tell Consul that we never intend to drop below this many servers.
81 # Ensures we do not permanently lose consensus after a temporary loss.
82 # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
83 autopilot.min_quorum = numConsensusServers;
85 # If there's only 1 node in the network, we allow self-join;
86 # otherwise, the node must not try to join itself, and join only the other servers.
87 # See https://github.com/hashicorp/consul/issues/2868
88 if numConsensusServers == 1
89 then allConsensusServerHosts
90 else builtins.filter (h: h != thisConsensusServerHost) allConsensusServerHosts;
# Convenience groupings of the test VMs defined by the node set above.
108 servers = [server1, server2, server3]
109 machines = [server1, server2, server3, client1, client2]
# Wait until the consul systemd unit is up on each machine.
# NOTE(review): the `for` line binding `m` (presumably `for m in machines:`)
# is elided from this excerpt — confirm against the full file.
112 m.wait_for_unit("consul.service")
115 def wait_for_healthy_servers():
# Raft-level health check: block until all 3 servers show up as `Voter`
# in `consul operator raft list-peers` output. Serf "alive" alone is not
# sufficient — see wait_for_all_machines_alive() for the weaker check.
116 # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
117 # for why the `Voter` column of `list-peers` has that info.
118 # TODO: The `grep true` relies on the fact that currently in
120 # # consul operator raft list-peers
121 # Node ID Address State Voter RaftProtocol
122 # server3 ... 192.168.1.3:8300 leader true 3
123 # server2 ... 192.168.1.2:8300 follower true 3
124 # server1 ... 192.168.1.1:8300 follower false 3
125 # `Voter` is the only boolean column.
126 # Change this to the more reliable way to be defined by
127 # https://github.com/hashicorp/consul/issues/8118
128 # once that ticket is closed.
# NOTE(review): the loop header binding `m` (presumably `for m in machines:`)
# and the call's closing parenthesis are elided from this excerpt.
# The hard-coded 3 matches len(servers).
130 m.wait_until_succeeds(
131 "[ $(consul operator raft list-peers | grep true | wc -l) == 3 ]"
135 def wait_for_all_machines_alive():
# Serf-level liveness check: block until `consul members` reports all
# machines (servers AND clients) as "alive".
# NOTE(review): the docstring delimiters around the next two lines are
# elided from this excerpt.
137 Note that Serf-"alive" does not mean "Raft"-healthy;
138 see `wait_for_healthy_servers()` for that instead.
# NOTE(review): the loop header binding `m` (presumably `for m in machines:`)
# is elided. The hard-coded 5 matches len(machines).
141 m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]")
# Initial cluster sanity: servers must reach Raft health first.
144 wait_for_healthy_servers()
145 # Also wait for clients to be alive.
146 wait_for_all_machines_alive()
# Smoke test: a KV write issued on one client must be readable from the
# other, proving both agents are wired into the same cluster.
148 client1.succeed("consul kv put testkey 42")
149 client2.succeed("[ $(consul kv get testkey) == 42 ]")
152 def rolling_restart_test(proper_rolling_procedure=True):
154 Tests that the cluster can tolerate failures of any single server,
155 following the recommended rolling upgrade procedure from
156 https://www.consul.io/docs/upgrading#standard-upgrades.
158 Optionally, `proper_rolling_procedure=False` can be given
159 to wait only for each server to be back `Healthy`, not `Stable`
160 in the Raft consensus, see Consul setting `ServerStabilizationTime` and
161 https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040.
# Take each server down in turn, exercise the cluster while it is down,
# then bring it back and wait for recovery before moving on.
164 for server in servers:
166 server.systemctl("stop consul")
168 # Make sure the stopped peer is recognized as being down
169 client1.wait_until_succeeds(
170 f"[ $(consul members | grep {server.name} | grep -o -E 'failed|left' | wc -l) == 1 ]"
173 # For each client, wait until they have connection again
174 # using `kv get -recurse` before issuing commands.
175 client1.wait_until_succeeds("consul kv get -recurse")
176 client2.wait_until_succeeds("consul kv get -recurse")
178 # Do some consul actions while one server is down.
179 client1.succeed("consul kv put testkey 43")
180 client2.succeed("[ $(consul kv get testkey) == 43 ]")
181 client2.succeed("consul kv delete testkey")
# Bring the stopped server back before touching the next one.
184 server.systemctl("start consul")
186 if proper_rolling_procedure:
# Proper procedure: wait for full Raft health (all servers Voters)
# before restarting the next server.
188 wait_for_healthy_servers()
190 # NOT proper rolling upgrade procedure, see above.
191 wait_for_all_machines_alive()
193 # Wait for client connections.
194 client1.wait_until_succeeds("consul kv get -recurse")
195 client2.wait_until_succeeds("consul kv get -recurse")
197 # Do some consul actions with server back up.
198 client1.succeed("consul kv put testkey 44")
199 client2.succeed("[ $(consul kv get testkey) == 44 ]")
200 client2.succeed("consul kv delete testkey")
203 def all_servers_crash_simultaneously_test():
205 Tests that the cluster will eventually come back after all
206 servers crash simultaneously.
# Stop all servers asynchronously so the outage is (near-)simultaneous
# rather than one-at-a-time.
209 for server in servers:
211 server.systemctl("stop --no-block consul")
213 for server in servers:
214 # --no-block is async, so ensure it has been stopped by now
215 server.wait_until_fails("systemctl is-active --quiet consul")
# Restart every server and wait for the cluster to re-form.
217 server.systemctl("start consul")
# With autopilot.min_quorum set (see server config above), consensus
# should recover rather than being permanently lost.
220 wait_for_healthy_servers()
222 # Wait for client connections.
223 client1.wait_until_succeeds("consul kv get -recurse")
224 client2.wait_until_succeeds("consul kv get -recurse")
226 # Do some consul actions with servers back up.
227 client1.succeed("consul kv put testkey 44")
228 client2.succeed("[ $(consul kv get testkey) == 44 ]")
229 client2.succeed("consul kv delete testkey")
# Run the scenarios. Each is announced with a print so test-log output
# can be matched to the scenario that produced it.
234 print("rolling_restart_test()")
235 rolling_restart_test()
237 print("all_servers_crash_simultaneously_test()")
238 all_servers_crash_simultaneously_test()
# Repeat the rolling restart using only the weaker Serf-alive wait,
# to check the cluster also survives the improper procedure.
240 print("rolling_restart_test(proper_rolling_procedure=False)")
241 rolling_restart_test(proper_rolling_procedure=False)