# NixOS VM test for a Consul cluster (nixos/tests/consul.nix):
# three Raft servers plus two client agents, exercising KV operations
# and consensus recovery across rolling and simultaneous server crashes.
import ./make-test-python.nix ({ pkgs, lib, ... }:

let
  # Settings shared by both servers and client agents.
  webUi = true;
  retry_interval = "1s";
  raft_multiplier = 1;

  # Consul `extraConfig` fragment merged into every node's config.
  defaultExtraConfig = {
    inherit retry_interval;
    performance.raft_multiplier = raft_multiplier;
  };
  # Fixed IPs of the three Consul servers; the IP doubles as each
  # server's identity (see `server` below).
  allConsensusServerHosts = [
    "192.168.1.1"
    "192.168.1.2"
    "192.168.1.3"
  ];

  # Fixed IPs of the two non-server client agents.
  allConsensusClientHosts = [
    "192.168.2.1"
    "192.168.2.2"
  ];
  # Firewall openings applied to every node.
  # See https://www.consul.io/docs/install/ports.html
  # (8300 server RPC, 8301 Serf LAN, 8302 Serf WAN,
  #  8500 HTTP API, 8600 DNS)
  firewallSettings = {
    allowedTCPPorts = [ 8301 8302 8600 8500 8300 ];
    allowedUDPPorts = [ 8301 8302 8600 ];
  };
33   client = index: { pkgs, ... }:
34     let
35       ip = builtins.elemAt allConsensusClientHosts index;
36     in
37       {
38         environment.systemPackages = [ pkgs.consul ];
40         networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
41           { address = ip; prefixLength = 16; }
42         ];
43         networking.firewall = firewallSettings;
45         services.consul = {
46           enable = true;
47           inherit webUi;
48           extraConfig = defaultExtraConfig // {
49             server = false;
50             retry_join = allConsensusServerHosts;
51             bind_addr = ip;
52           };
53         };
54       };
56   server = index: { pkgs, ... }:
57     let
58       numConsensusServers = builtins.length allConsensusServerHosts;
59       thisConsensusServerHost = builtins.elemAt allConsensusServerHosts index;
60       ip = thisConsensusServerHost; # since we already use IPs to identify servers
61     in
62       {
63         networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
64           { address = ip; prefixLength = 16; }
65         ];
66         networking.firewall = firewallSettings;
68         services.consul =
69           assert builtins.elem thisConsensusServerHost allConsensusServerHosts;
70           {
71             enable = true;
72             inherit webUi;
73             extraConfig = defaultExtraConfig // {
74               server = true;
75               bootstrap_expect = numConsensusServers;
76               # Tell Consul that we never intend to drop below this many servers.
77               # Ensures to not permanently lose consensus after temporary loss.
78               # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
79               autopilot.min_quorum = numConsensusServers;
80               retry_join =
81                 # If there's only 1 node in the network, we allow self-join;
82                 # otherwise, the node must not try to join itself, and join only the other servers.
83                 # See https://github.com/hashicorp/consul/issues/2868
84                 if numConsensusServers == 1
85                   then allConsensusServerHosts
86                   else builtins.filter (h: h != thisConsensusServerHost) allConsensusServerHosts;
87               bind_addr = ip;
88             };
89           };
90       };
91 in {
92   name = "consul";
  nodes = {
    # Three servers forming the Raft consensus group.
    server1 = server 0;
    server2 = server 1;
    server3 = server 2;
    # Two non-server agents used to exercise the KV store.
    client1 = client 0;
    client2 = client 1;
  };
103   testScript = ''
104     servers = [server1, server2, server3]
105     machines = [server1, server2, server3, client1, client2]
107     for m in machines:
108         m.wait_for_unit("consul.service")
111     def wait_for_healthy_servers():
112         # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
113         # for why the `Voter` column of `list-peers` has that info.
114         # TODO: The `grep true` relies on the fact that currently in
115         #       the output like
116         #           # consul operator raft list-peers
117         #           Node     ID   Address           State     Voter  RaftProtocol
118         #           server3  ...  192.168.1.3:8300  leader    true   3
119         #           server2  ...  192.168.1.2:8300  follower  true   3
120         #           server1  ...  192.168.1.1:8300  follower  false  3
121         #       `Voter`is the only boolean column.
122         #       Change this to the more reliable way to be defined by
123         #       https://github.com/hashicorp/consul/issues/8118
124         #       once that ticket is closed.
125         for m in machines:
126             m.wait_until_succeeds(
127                 "[ $(consul operator raft list-peers | grep true | wc -l) == 3 ]"
128             )
131     def wait_for_all_machines_alive():
132         """
133         Note that Serf-"alive" does not mean "Raft"-healthy;
134         see `wait_for_healthy_servers()` for that instead.
135         """
136         for m in machines:
137             m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]")
140     wait_for_healthy_servers()
141     # Also wait for clients to be alive.
142     wait_for_all_machines_alive()
144     client1.succeed("consul kv put testkey 42")
145     client2.succeed("[ $(consul kv get testkey) == 42 ]")
148     def rolling_reboot_test(proper_rolling_procedure=True):
149         """
150         Tests that the cluster can tolearate failures of any single server,
151         following the recommended rolling upgrade procedure from
152         https://www.consul.io/docs/upgrading#standard-upgrades.
154         Optionally, `proper_rolling_procedure=False` can be given
155         to wait only for each server to be back `Healthy`, not `Stable`
156         in the Raft consensus, see Consul setting `ServerStabilizationTime` and
157         https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040.
158         """
160         for server in servers:
161             server.crash()
163             # For each client, wait until they have connection again
164             # using `kv get -recurse` before issuing commands.
165             client1.wait_until_succeeds("consul kv get -recurse")
166             client2.wait_until_succeeds("consul kv get -recurse")
168             # Do some consul actions while one server is down.
169             client1.succeed("consul kv put testkey 43")
170             client2.succeed("[ $(consul kv get testkey) == 43 ]")
171             client2.succeed("consul kv delete testkey")
173             # Restart crashed machine.
174             server.start()
176             if proper_rolling_procedure:
177                 # Wait for recovery.
178                 wait_for_healthy_servers()
179             else:
180                 # NOT proper rolling upgrade procedure, see above.
181                 wait_for_all_machines_alive()
183             # Wait for client connections.
184             client1.wait_until_succeeds("consul kv get -recurse")
185             client2.wait_until_succeeds("consul kv get -recurse")
187             # Do some consul actions with server back up.
188             client1.succeed("consul kv put testkey 44")
189             client2.succeed("[ $(consul kv get testkey) == 44 ]")
190             client2.succeed("consul kv delete testkey")
193     def all_servers_crash_simultaneously_test():
194         """
195         Tests that the cluster will eventually come back after all
196         servers crash simultaneously.
197         """
199         for server in servers:
200             server.crash()
202         for server in servers:
203             server.start()
205         # Wait for recovery.
206         wait_for_healthy_servers()
208         # Wait for client connections.
209         client1.wait_until_succeeds("consul kv get -recurse")
210         client2.wait_until_succeeds("consul kv get -recurse")
212         # Do some consul actions with servers back up.
213         client1.succeed("consul kv put testkey 44")
214         client2.succeed("[ $(consul kv get testkey) == 44 ]")
215         client2.succeed("consul kv delete testkey")
218     # Run the tests.
220     print("rolling_reboot_test()")
221     rolling_reboot_test()
223     print("all_servers_crash_simultaneously_test()")
224     all_servers_crash_simultaneously_test()
226     print("rolling_reboot_test(proper_rolling_procedure=False)")
227     rolling_reboot_test(proper_rolling_procedure=False)
228   '';