import ./make-test-python.nix ({pkgs, lib, ...}:

let
  # Settings for both servers and agents
  webUi = true;
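  # Tuned for fast test runs: a short retry_interval and the lowest
  # raft_multiplier make leader election and failure detection quick.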
  retry_interval = "1s";
  raft_multiplier = 1;

  defaultExtraConfig = {
    inherit retry_interval;
    performance = {
      inherit raft_multiplier;
    };
  };

  allConsensusServerHosts = [
    "192.168.1.1"
    "192.168.1.2"
    "192.168.1.3"
  ];

  allConsensusClientHosts = [
    "192.168.2.1"
    "192.168.2.2"
  ];

  firewallSettings = {
    # See https://www.consul.io/docs/install/ports.html
    allowedTCPPorts = [ 8301 8302 8600 8500 8300 ];
    allowedUDPPorts = [ 8301 8302 8600 ];
  };
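
  # A Consul client agent node; `index` selects this node's IP from
  # allConsensusClientHosts. Client agents join the cluster but do not
  # take part in Raft consensus.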
  client = index: { pkgs, ... }:
    let
      ip = builtins.elemAt allConsensusClientHosts index;
    in
      {
        environment.systemPackages = [ pkgs.consul ];

        networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
          { address = ip; prefixLength = 16; }
        ];
        networking.firewall = firewallSettings;
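
        # Consul is marked unfree in nixpkgs (it is licensed under the
        # Business Source License), so it has to be allowlisted explicitly.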
        nixpkgs.config.allowUnfreePredicate = pkg: builtins.elem (lib.getName pkg) [ "consul" ];

        services.consul = {
          enable = true;
          inherit webUi;
          extraConfig = defaultExtraConfig // {
            server = false;
            retry_join = allConsensusServerHosts;
            bind_addr = ip;
          };
        };
      };
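
  # A Consul server node. Servers are identified by their IPs from
  # allConsensusServerHosts, so the same list drives both addressing
  # and cluster membership.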
  server = index: { pkgs, ... }:
    let
      numConsensusServers = builtins.length allConsensusServerHosts;
      thisConsensusServerHost = builtins.elemAt allConsensusServerHosts index;
      ip = thisConsensusServerHost; # since we already use IPs to identify servers
    in
      {
        networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
          { address = ip; prefixLength = 16; }
        ];
        networking.firewall = firewallSettings;

        nixpkgs.config.allowUnfreePredicate = pkg: builtins.elem (lib.getName pkg) [ "consul" ];

        services.consul =
          assert builtins.elem thisConsensusServerHost allConsensusServerHosts;
          {
            enable = true;
            inherit webUi;
            extraConfig = defaultExtraConfig // {
              server = true;
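              # Do not elect a leader until all servers have joined the cluster.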
              bootstrap_expect = numConsensusServers;
              # Tell Consul that we never intend to drop below this many servers.
              # This ensures that consensus is not permanently lost after a
              # temporary loss of quorum.
              # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
              autopilot.min_quorum = numConsensusServers;
              retry_join =
                # If there's only 1 node in the network, we allow self-join;
                # otherwise, the node must not try to join itself, and joins
                # only the other servers.
                # See https://github.com/hashicorp/consul/issues/2868
                if numConsensusServers == 1
                  then allConsensusServerHosts
                  else builtins.filter (h: h != thisConsensusServerHost) allConsensusServerHosts;
              bind_addr = ip;
            };
          };
      };
in {
  name = "consul";
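
  # Three servers: the smallest cluster that keeps Raft quorum when one
  # server fails. The two clients issue commands against the cluster.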
  nodes = {
    server1 = server 0;
    server2 = server 1;
    server3 = server 2;

    client1 = client 0;
    client2 = client 1;
  };

  testScript = ''
    servers = [server1, server2, server3]
    machines = [server1, server2, server3, client1, client2]

    for m in machines:
        m.wait_for_unit("consul.service")
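
    # An active consul.service only means the daemon started; the helpers
    # below additionally check Raft quorum (servers) and Serf membership
    # (all nodes).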

    def wait_for_healthy_servers():
        # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
        # for why the `Voter` column of `list-peers` has that info.
        # TODO: The `grep true` relies on the fact that currently, in
        #       output like
        #           # consul operator raft list-peers
        #           Node     ID   Address           State     Voter  RaftProtocol
        #           server3  ...  192.168.1.3:8300  leader    true   3
        #           server2  ...  192.168.1.2:8300  follower  true   3
        #           server1  ...  192.168.1.1:8300  follower  false  3
        #       `Voter` is the only boolean column.
        #       Change this to the more reliable method to be defined in
        #       https://github.com/hashicorp/consul/issues/8118
        #       once that ticket is closed.
        for m in machines:
            m.wait_until_succeeds(
                "[ $(consul operator raft list-peers | grep true | wc -l) == 3 ]"
            )

    def wait_for_all_machines_alive():
        """
        Note that Serf-"alive" does not mean "Raft"-healthy;
        see `wait_for_healthy_servers()` for that instead.
        """
        for m in machines:
            m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]")

    wait_for_healthy_servers()
    # Also wait for clients to be alive.
    wait_for_all_machines_alive()
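
    # Basic KV smoke test: a value written through one client agent must be
    # readable through the other.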
    client1.succeed("consul kv put testkey 42")
    client2.succeed("[ $(consul kv get testkey) == 42 ]")

    def rolling_restart_test(proper_rolling_procedure=True):
        """
        Tests that the cluster can tolerate the failure of any single server,
        following the recommended rolling upgrade procedure from
        https://www.consul.io/docs/upgrading#standard-upgrades.

        Optionally, `proper_rolling_procedure=False` can be given
        to wait only for each server to be back `Healthy`, not `Stable`
        in the Raft consensus; see the Consul setting `ServerStabilizationTime` and
        https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040.
        """
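
        # For each server in turn: cut its network and stop Consul, exercise
        # the cluster while it is down, then bring it back and wait for
        # recovery before moving on to the next one.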
        for server in servers:
            server.block()
            server.systemctl("stop consul")

            # Make sure the stopped peer is recognized as being down.
            client1.wait_until_succeeds(
                f"[ $(consul members | grep {server.name} | grep -o -E 'failed|left' | wc -l) == 1 ]"
            )

            # For each client, wait until it has a connection again
            # (via `kv get -recurse`) before issuing commands.
            client1.wait_until_succeeds("consul kv get -recurse")
            client2.wait_until_succeeds("consul kv get -recurse")

            # Do some consul actions while one server is down.
            client1.succeed("consul kv put testkey 43")
            client2.succeed("[ $(consul kv get testkey) == 43 ]")
            client2.succeed("consul kv delete testkey")

            server.unblock()
            server.systemctl("start consul")

            if proper_rolling_procedure:
                # Wait for recovery.
                wait_for_healthy_servers()
            else:
                # NOT the proper rolling upgrade procedure, see above.
                wait_for_all_machines_alive()

            # Wait for client connections.
            client1.wait_until_succeeds("consul kv get -recurse")
            client2.wait_until_succeeds("consul kv get -recurse")

            # Do some consul actions with the server back up.
            client1.succeed("consul kv put testkey 44")
            client2.succeed("[ $(consul kv get testkey) == 44 ]")
            client2.succeed("consul kv delete testkey")

    def all_servers_crash_simultaneously_test():
        """
        Tests that the cluster will eventually come back after all
        servers crash simultaneously.
        """
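
        # Stop all servers at once; `--no-block` makes systemctl return
        # immediately, so the stops overlap rather than happening one by one.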
        for server in servers:
            server.block()
            server.systemctl("stop --no-block consul")

        for server in servers:
            # --no-block is async, so ensure it has been stopped by now.
            server.wait_until_fails("systemctl is-active --quiet consul")
            server.unblock()
            server.systemctl("start consul")

        # Wait for recovery.
        wait_for_healthy_servers()

        # Wait for client connections.
        client1.wait_until_succeeds("consul kv get -recurse")
        client2.wait_until_succeeds("consul kv get -recurse")

        # Do some consul actions with servers back up.
        client1.succeed("consul kv put testkey 44")
        client2.succeed("[ $(consul kv get testkey) == 44 ]")
        client2.succeed("consul kv delete testkey")

    # Run the tests.

    print("rolling_restart_test()")
    rolling_restart_test()

    print("all_servers_crash_simultaneously_test()")
    all_servers_crash_simultaneously_test()

    print("rolling_restart_test(proper_rolling_procedure=False)")
    rolling_restart_test(proper_rolling_procedure=False)
  '';
})