import ./make-test-python.nix (
  { pkgs, lib, ... }:

  let
    # Settings used by both server and client agents
    webUi = true;
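    # Aggressive timing: retry cluster joins every second and use the fastest
    # Raft timing (raft_multiplier = 1) so the cluster converges and fails
    # over quickly inside the test VMs.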
    retry_interval = "1s";
    raft_multiplier = 1;

    defaultExtraConfig = {
      inherit retry_interval;
      performance = {
        inherit raft_multiplier;
      };
    };

    allConsensusServerHosts = [
      "192.168.1.1"
      "192.168.1.2"
      "192.168.1.3"
    ];

    allConsensusClientHosts = [
      "192.168.2.1"
      "192.168.2.2"
    ];

    firewallSettings = {
      # See https://www.consul.io/docs/install/ports.html
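      # 8300: server RPC, 8301: Serf LAN, 8302: Serf WAN,
      # 8500: HTTP API, 8600: DNS interface.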
      allowedTCPPorts = [
        8301
        8302
        8600
        8500
        8300
      ];
      allowedUDPPorts = [
        8301
        8302
        8600
      ];
    };

    client =
      index:
      { pkgs, ... }:
      let
        ip = builtins.elemAt allConsensusClientHosts index;
      in
      {
        environment.systemPackages = [ pkgs.consul ];

        networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
          {
            address = ip;
            prefixLength = 16;
          }
        ];
        networking.firewall = firewallSettings;
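
        # Consul is licensed under the BUSL and is therefore marked unfree in
        # nixpkgs, so it has to be allowed explicitly for this test.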
        nixpkgs.config.allowUnfreePredicate = pkg: builtins.elem (lib.getName pkg) [ "consul" ];

        services.consul = {
          enable = true;
          inherit webUi;
          extraConfig = defaultExtraConfig // {
            server = false;
            retry_join = allConsensusServerHosts;
            bind_addr = ip;
          };
        };
      };

    server =
      index:
      { pkgs, ... }:
      let
        numConsensusServers = builtins.length allConsensusServerHosts;
        thisConsensusServerHost = builtins.elemAt allConsensusServerHosts index;
        ip = thisConsensusServerHost; # since we already use IPs to identify servers
      in
      {
        networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
          {
            address = ip;
            prefixLength = 16;
          }
        ];
        networking.firewall = firewallSettings;

        nixpkgs.config.allowUnfreePredicate = pkg: builtins.elem (lib.getName pkg) [ "consul" ];

        services.consul =
          assert builtins.elem thisConsensusServerHost allConsensusServerHosts;
          {
            enable = true;
            inherit webUi;
            extraConfig = defaultExtraConfig // {
              server = true;
              bootstrap_expect = numConsensusServers;
              # Tell Consul that we never intend to drop below this many servers.
              # This ensures that consensus is not permanently lost after a
              # temporary loss of quorum.
              # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
              autopilot.min_quorum = numConsensusServers;
              retry_join =
                # If there's only 1 node in the network, we allow self-join;
                # otherwise, the node must not try to join itself and should
                # join only the other servers.
                # See https://github.com/hashicorp/consul/issues/2868
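                # E.g. on 192.168.1.1 this evaluates to
                # [ "192.168.1.2" "192.168.1.3" ].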
                if numConsensusServers == 1 then
                  allConsensusServerHosts
                else
                  builtins.filter (h: h != thisConsensusServerHost) allConsensusServerHosts;
              bind_addr = ip;
            };
          };
      };
  in
  {
    name = "consul";

    nodes = {
      server1 = server 0;
      server2 = server 1;
      server3 = server 2;

      client1 = client 0;
      client2 = client 1;
    };

    testScript = ''
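      # The node names from `nodes` above are available here as Machine objects
      # provided by the NixOS test driver.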
      servers = [server1, server2, server3]
      machines = [server1, server2, server3, client1, client2]

      for m in machines:
          m.wait_for_unit("consul.service")


      def wait_for_healthy_servers():
          # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
          # for why the `Voter` column of `list-peers` has that info.
          # TODO: The `grep true` relies on the fact that currently, in
          #       output like
          #           # consul operator raft list-peers
          #           Node     ID   Address           State     Voter  RaftProtocol
          #           server3  ...  192.168.1.3:8300  leader    true   3
          #           server2  ...  192.168.1.2:8300  follower  true   3
          #           server1  ...  192.168.1.1:8300  follower  false  3
          #       `Voter` is the only boolean column.
          #       Change this to the more reliable way to be defined by
          #       https://github.com/hashicorp/consul/issues/8118
          #       once that ticket is closed.
          for m in machines:
              m.wait_until_succeeds(
                  "[ $(consul operator raft list-peers | grep true | wc -l) == 3 ]"
              )
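          # A stricter variant (sketch, not used here) would count only rows
          # whose `Voter` column is literally `true`, e.g.:
          #     consul operator raft list-peers | awk 'NR > 1 && $5 == "true"' | wc -l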


      def wait_for_all_machines_alive():
          """
          Note that Serf-"alive" does not mean Raft-"healthy";
          see `wait_for_healthy_servers()` for that instead.
          """
          for m in machines:
              m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]")


      wait_for_healthy_servers()
      # Also wait for clients to be alive.
      wait_for_all_machines_alive()
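
      # Sanity check: a KV value written via one client must be readable via the other.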
      client1.succeed("consul kv put testkey 42")
      client2.succeed("[ $(consul kv get testkey) == 42 ]")


      def rolling_restart_test(proper_rolling_procedure=True):
          """
          Tests that the cluster can tolerate the failure of any single server,
          following the recommended rolling upgrade procedure from
          https://www.consul.io/docs/upgrading#standard-upgrades.

          Optionally, `proper_rolling_procedure=False` can be given
          to wait only for each server to be back `Healthy`, not `Stable`
          in the Raft consensus; see the Consul setting `ServerStabilizationTime` and
          https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040.
          """

          for server in servers:
              server.block()
              server.systemctl("stop consul")

              # Make sure the stopped peer is recognized as being down.
              client1.wait_until_succeeds(
                  f"[ $(consul members | grep {server.name} | grep -o -E 'failed|left' | wc -l) == 1 ]"
              )

              # For each client, wait until it has a connection again
              # (using `kv get -recurse`) before issuing commands.
              client1.wait_until_succeeds("consul kv get -recurse")
              client2.wait_until_succeeds("consul kv get -recurse")

              # Do some consul actions while one server is down.
              client1.succeed("consul kv put testkey 43")
              client2.succeed("[ $(consul kv get testkey) == 43 ]")
              client2.succeed("consul kv delete testkey")

              server.unblock()
              server.systemctl("start consul")

              if proper_rolling_procedure:
                  # Wait for recovery.
                  wait_for_healthy_servers()
              else:
                  # NOT the proper rolling upgrade procedure, see above.
                  wait_for_all_machines_alive()

              # Wait for client connections.
              client1.wait_until_succeeds("consul kv get -recurse")
              client2.wait_until_succeeds("consul kv get -recurse")

              # Do some consul actions with the server back up.
              client1.succeed("consul kv put testkey 44")
              client2.succeed("[ $(consul kv get testkey) == 44 ]")
              client2.succeed("consul kv delete testkey")


      def all_servers_crash_simultaneously_test():
          """
          Tests that the cluster will eventually come back after all
          servers crash simultaneously.
          """
          for server in servers:
              server.block()
              server.systemctl("stop --no-block consul")

          for server in servers:
              # `stop --no-block` is asynchronous, so make sure consul has
              # actually stopped by now.
              server.wait_until_fails("systemctl is-active --quiet consul")
              server.unblock()
              server.systemctl("start consul")

          # Wait for recovery.
          wait_for_healthy_servers()

          # Wait for client connections.
          client1.wait_until_succeeds("consul kv get -recurse")
          client2.wait_until_succeeds("consul kv get -recurse")

          # Do some consul actions with the servers back up.
          client1.succeed("consul kv put testkey 44")
          client2.succeed("[ $(consul kv get testkey) == 44 ]")
          client2.succeed("consul kv delete testkey")


      # Run the tests.

      print("rolling_restart_test()")
      rolling_restart_test()

      print("all_servers_crash_simultaneously_test()")
      all_servers_crash_simultaneously_test()

      print("rolling_restart_test(proper_rolling_procedure=False)")
      rolling_restart_test(proper_rolling_procedure=False)
    '';
  }
)