import ./make-test-python.nix (
  # Settings for both servers and agents
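  # A minimal sketch of the shared timing settings (assumed values,
  # chosen low so the VM test converges quickly):
  retry_interval = "1s";
  raft_multiplier = 1;
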
  defaultExtraConfig = {
    inherit retry_interval;
    inherit raft_multiplier;
  };
  allConsensusServerHosts = [
    "192.168.1.1"
    "192.168.1.2"
    "192.168.1.3"
  ];
  allConsensusClientHosts = [
    # (assumed addresses for the two client machines)
    "192.168.2.1"
    "192.168.2.2"
  ];
  # See https://www.consul.io/docs/install/ports.html
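  firewallSettings = {
    # A sketch assuming the standard Consul port assignments from the
    # page above: 8300 server RPC, 8301/8302 Serf LAN/WAN, 8500 HTTP
    # API, 8600 DNS.
    allowedTCPPorts = [ 8300 8301 8302 8500 8600 ];
    allowedUDPPorts = [ 8301 8302 8600 ];
  };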

  ip = builtins.elemAt allConsensusClientHosts index;
  environment.systemPackages = [ pkgs.consul ];

  networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
    { address = ip; prefixLength = 16; }
  ];

  networking.firewall = firewallSettings;

  # Consul is distributed under the non-free BUSL license, so it has to
  # be allowed explicitly.
  nixpkgs.config.allowUnfreePredicate = pkg: builtins.elem (lib.getName pkg) [ "consul" ];

  extraConfig = defaultExtraConfig // {
    retry_join = allConsensusServerHosts;
  };
  numConsensusServers = builtins.length allConsensusServerHosts;
  thisConsensusServerHost = builtins.elemAt allConsensusServerHosts index;
  ip = thisConsensusServerHost; # since we already use IPs to identify servers
  networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
    { address = ip; prefixLength = 16; }
  ];

  networking.firewall = firewallSettings;

  nixpkgs.config.allowUnfreePredicate = pkg: builtins.elem (lib.getName pkg) [ "consul" ];
  assert builtins.elem thisConsensusServerHost allConsensusServerHosts;
  extraConfig = defaultExtraConfig // {
    bootstrap_expect = numConsensusServers;

    # Tell Consul that we never intend to drop below this many servers.
    # This ensures that consensus is not permanently lost after a
    # temporary loss.
    # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
    autopilot.min_quorum = numConsensusServers;
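    # (With three servers, a bare Raft majority would be 2, so without
    # this setting autopilot's dead-server cleanup could shrink the
    # voter set to 2, after which one more failure loses consensus for
    # good; see the linked issue comment.)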
    # If there's only 1 node in the network, we allow self-join;
    # otherwise, the node must not try to join itself, and must join
    # only the other servers.
    # See https://github.com/hashicorp/consul/issues/2868
    retry_join =
      if numConsensusServers == 1 then
        allConsensusServerHosts
      else
        builtins.filter (h: h != thisConsensusServerHost) allConsensusServerHosts;
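    # For example (illustrative): with the three server addresses above,
    # the node at 192.168.1.2 gets
    # retry_join = [ "192.168.1.1" "192.168.1.3" ].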

servers = [server1, server2, server3]
machines = [server1, server2, server3, client1, client2]

for m in machines:
    m.wait_for_unit("consul.service")

def wait_for_healthy_servers():
    # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
    # for why the `Voter` column of `list-peers` has that info.
    # TODO: The `grep true` relies on the fact that currently, in output like
    #
    #     # consul operator raft list-peers
    #     Node     ID   Address           State     Voter  RaftProtocol
    #     server3  ...  192.168.1.3:8300  leader    true   3
    #     server2  ...  192.168.1.2:8300  follower  true   3
    #     server1  ...  192.168.1.1:8300  follower  false  3
    #
    #     `Voter` is the only boolean column.
    #     Change this to the more reliable check that
    #     https://github.com/hashicorp/consul/issues/8118
    #     will define, once that ticket is closed.
    for m in machines:
        m.wait_until_succeeds(
            "[ $(consul operator raft list-peers | grep true | wc -l) == 3 ]"
        )

def wait_for_all_machines_alive():
    """
    Note that Serf-"alive" does not mean "Raft"-healthy;
    see `wait_for_healthy_servers()` for that instead.
    """
    for m in machines:
        # 5 = 3 servers + 2 clients
        m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]")

wait_for_healthy_servers()
# Also wait for clients to be alive.
wait_for_all_machines_alive()

client1.succeed("consul kv put testkey 42")
client2.succeed("[ $(consul kv get testkey) == 42 ]")

def rolling_restart_test(proper_rolling_procedure=True):
    """
    Tests that the cluster can tolerate failures of any single server,
    following the recommended rolling upgrade procedure from
    https://www.consul.io/docs/upgrading#standard-upgrades.

    Optionally, `proper_rolling_procedure=False` can be given
    to wait only for each server to be back `Healthy`, not `Stable`
    in the Raft consensus; see Consul setting `ServerStabilizationTime` and
    https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040.
    """
    for server in servers:
        server.systemctl("stop consul")

        # Make sure the stopped peer is recognized as being down.
        client1.wait_until_succeeds(
            f"[ $(consul members | grep {server.name} | grep -o -E 'failed|left' | wc -l) == 1 ]"
        )

        # For each client, wait until it has a connection again
        # using `kv get -recurse` before issuing commands.
        client1.wait_until_succeeds("consul kv get -recurse")
        client2.wait_until_succeeds("consul kv get -recurse")

        # Do some Consul actions while one server is down.
        client1.succeed("consul kv put testkey 43")
        client2.succeed("[ $(consul kv get testkey) == 43 ]")
        client2.succeed("consul kv delete testkey")

        server.systemctl("start consul")

        if proper_rolling_procedure:
            # Wait for the restarted server to be fully back in the
            # Raft consensus (`Stable`) before touching the next one.
            wait_for_healthy_servers()
        else:
            # NOT the proper rolling upgrade procedure; see above.
            wait_for_all_machines_alive()

        # Wait for client connections.
        client1.wait_until_succeeds("consul kv get -recurse")
        client2.wait_until_succeeds("consul kv get -recurse")

        # Do some Consul actions with the server back up.
        client1.succeed("consul kv put testkey 44")
        client2.succeed("[ $(consul kv get testkey) == 44 ]")
        client2.succeed("consul kv delete testkey")

def all_servers_crash_simultaneously_test():
    """
    Tests that the cluster will eventually come back after all
    servers crash simultaneously.
    """
    for server in servers:
        server.systemctl("stop --no-block consul")

    for server in servers:
        # --no-block is asynchronous, so make sure the service has
        # actually stopped before restarting it.
        server.wait_until_fails("systemctl is-active --quiet consul")

        server.systemctl("start consul")

    wait_for_healthy_servers()

    # Wait for client connections.
    client1.wait_until_succeeds("consul kv get -recurse")
    client2.wait_until_succeeds("consul kv get -recurse")

    # Do some Consul actions with the servers back up.
    client1.succeed("consul kv put testkey 44")
    client2.succeed("[ $(consul kv get testkey) == 44 ]")
    client2.succeed("consul kv delete testkey")

print("rolling_restart_test()")
rolling_restart_test()

print("all_servers_crash_simultaneously_test()")
all_servers_crash_simultaneously_test()

print("rolling_restart_test(proper_rolling_procedure=False)")
rolling_restart_test(proper_rolling_procedure=False)