# NixOS test: Prometheus + Thanos + pushgateway + MinIO (S3)
# Source: nixos/tests/prometheus.nix (NixOS 23.11 release)
# blob 01112738937752e0d0c9254643f74fdc84472e05
let
  # TCP ports shared across the test machines.
  grpcPort   = 19090; # Thanos gRPC (sidecar on "prometheus", store on "store")
  queryPort  =  9090; # Prometheus web UI / Thanos query HTTP
  minioPort  =  9000; # MinIO S3 endpoint on machine "s3"
  pushgwPort =  9091; # Prometheus pushgateway
  frontPort  =  9092; # Thanos query-frontend HTTP

  # Static MinIO credentials (test-only values, also passed to Thanos).
  s3 = {
    accessKey = "BKIKJAA5BMMU2RHO6IBB";
    secretKey = "V7f1CwQqAcwo80UEIJEjc5gVQUSSx5ohQ9GSrr12";
  };

  # Thanos object-store configuration pointing at the MinIO bucket.
  # `inherit objstore;` below reuses it for sidecar, store and compact.
  objstore.config = {
    type = "S3";
    config = {
      bucket = "thanos-bucket";
      endpoint = "s3:${toString minioPort}";
      region =  "us-east-1";
      access_key = s3.accessKey;
      secret_key = s3.secretKey;
      insecure = true; # plain HTTP inside the test network
      signature_version2 = false;
      put_user_metadata = {};
      http_config = {
        idle_conn_timeout = "0s";
        insecure_skip_verify = false;
      };
      trace = {
        enable = false;
      };
    };
  };
in import ./make-test-python.nix {
  name = "prometheus";
37   nodes = {
38     prometheus = { pkgs, ... }: {
39       virtualisation.diskSize = 2 * 1024;
40       virtualisation.memorySize = 2048;
41       environment.systemPackages = [ pkgs.jq ];
42       networking.firewall.allowedTCPPorts = [ grpcPort ];
43       services.prometheus = {
44         enable = true;
45         enableReload = true;
46         scrapeConfigs = [
47           {
48             job_name = "prometheus";
49             static_configs = [
50               {
51                 targets = [ "127.0.0.1:${toString queryPort}" ];
52                 labels = { instance = "localhost"; };
53               }
54             ];
55           }
56           {
57             job_name = "pushgateway";
58             scrape_interval = "1s";
59             static_configs = [
60               {
61                 targets = [ "127.0.0.1:${toString pushgwPort}" ];
62               }
63             ];
64           }
65         ];
66         rules = [
67           ''
68             groups:
69               - name: test
70                 rules:
71                   - record: testrule
72                     expr: count(up{job="prometheus"})
73           ''
74         ];
75         globalConfig = {
76           external_labels = {
77             some_label = "required by thanos";
78           };
79         };
80         extraFlags = [
81           # Required by thanos
82           "--storage.tsdb.min-block-duration=5s"
83           "--storage.tsdb.max-block-duration=5s"
84         ];
85       };
86       services.prometheus.pushgateway = {
87         enable = true;
88         web.listen-address = ":${toString pushgwPort}";
89         persistMetrics = true;
90         persistence.interval = "1s";
91         stateDir = "prometheus-pushgateway";
92       };
93       services.thanos = {
94         sidecar = {
95           enable = true;
96           grpc-address = "0.0.0.0:${toString grpcPort}";
97           inherit objstore;
98         };
100         # TODO: Add some tests for these services:
101         #rule = {
102         #  enable = true;
103         #  http-address = "0.0.0.0:19194";
104         #  grpc-address = "0.0.0.0:19193";
105         #  query.addresses = [
106         #    "localhost:19191"
107         #  ];
108         #  labels = {
109         #    just = "some";
110         #    nice = "labels";
111         #  };
112         #};
113         #
114         #receive = {
115         #  http-address = "0.0.0.0:19195";
116         #  enable = true;
117         #  labels = {
118         #    just = "some";
119         #    nice = "labels";
120         #  };
121         #};
122       };
123       # Adds a "specialisation" of the above config which allows us to
124       # "switch" to it and see if the services.prometheus.enableReload
125       # functionality actually reloads the prometheus service instead of
126       # restarting it.
127       specialisation = {
128         "prometheus-config-change" = {
129           configuration = {
130             environment.systemPackages = [ pkgs.yq ];
132             # This configuration just adds a new prometheus job
133             # to scrape the node_exporter metrics of the s3 machine.
134             services.prometheus = {
135               scrapeConfigs = [
136                 {
137                   job_name = "s3-node_exporter";
138                   static_configs = [
139                     {
140                       targets = [ "s3:9100" ];
141                     }
142                   ];
143                 }
144               ];
145             };
146           };
147         };
148       };
149     };
151     query = { pkgs, ... }: {
152       environment.systemPackages = [ pkgs.jq ];
153       services.thanos.query = {
154         enable = true;
155         http-address = "0.0.0.0:${toString queryPort}";
156         endpoints = [
157           "prometheus:${toString grpcPort}"
158         ];
159       };
160       services.thanos.query-frontend = {
161         enable = true;
162         http-address = "0.0.0.0:${toString frontPort}";
163         query-frontend.downstream-url = "http://127.0.0.1:${toString queryPort}";
164       };
165     };
167     store = { pkgs, ... }: {
168       virtualisation.diskSize = 2 * 1024;
169       virtualisation.memorySize = 2048;
170       environment.systemPackages = with pkgs; [ jq thanos ];
171       services.thanos.store = {
172         enable = true;
173         http-address = "0.0.0.0:10902";
174         grpc-address = "0.0.0.0:${toString grpcPort}";
175         inherit objstore;
176         sync-block-duration = "1s";
177       };
178       services.thanos.compact = {
179         enable = true;
180         http-address = "0.0.0.0:10903";
181         inherit objstore;
182         consistency-delay = "5s";
183       };
184       services.thanos.query = {
185         enable = true;
186         http-address = "0.0.0.0:${toString queryPort}";
187         endpoints = [
188           "localhost:${toString grpcPort}"
189         ];
190       };
191     };
193     s3 = { pkgs, ... } : {
194       # Minio requires at least 1GiB of free disk space to run.
195       virtualisation = {
196         diskSize = 2 * 1024;
197       };
198       networking.firewall.allowedTCPPorts = [ minioPort ];
200       services.minio = {
201         enable = true;
202         inherit (s3) accessKey secretKey;
203       };
205       environment.systemPackages = [ pkgs.minio-client ];
207       services.prometheus.exporters.node = {
208         enable = true;
209         openFirewall = true;
210       };
211     };
212   };
214   testScript = { nodes, ... } : ''
215     import json
217     # Before starting the other machines we first make sure that our S3 service is online
218     # and has a bucket added for thanos:
219     s3.start()
220     s3.wait_for_unit("minio.service")
221     s3.wait_for_open_port(${toString minioPort})
222     s3.succeed(
223         "mc config host add minio "
224         + "http://localhost:${toString minioPort} "
225         + "${s3.accessKey} ${s3.secretKey} --api s3v4",
226         "mc mb minio/thanos-bucket",
227     )
229     # Now that s3 has started we can start the other machines:
230     for machine in prometheus, query, store:
231         machine.start()
233     # Check if prometheus responds to requests:
234     prometheus.wait_for_unit("prometheus.service")
236     prometheus.wait_for_open_port(${toString queryPort})
237     prometheus.succeed("curl -sf http://127.0.0.1:${toString queryPort}/metrics")
239     # Let's test if pushing a metric to the pushgateway succeeds:
240     prometheus.wait_for_unit("pushgateway.service")
241     prometheus.succeed(
242         "echo 'some_metric 3.14' | "
243         + "curl -f --data-binary \@- "
244         + "http://127.0.0.1:${toString pushgwPort}/metrics/job/some_job"
245     )
247     # Now check whether that metric gets ingested by prometheus.
248     # Since we'll check for the metric several times on different machines
249     # we abstract the test using the following function:
251     # Function to check if the metric "some_metric" has been received and returns the correct value.
252     def wait_for_metric(machine):
253         return machine.wait_until_succeeds(
254             "curl -sf 'http://127.0.0.1:${toString queryPort}/api/v1/query?query=some_metric' | "
255             + "jq '.data.result[0].value[1]' | grep '\"3.14\"'"
256         )
259     wait_for_metric(prometheus)
261     # Let's test if the pushgateway persists metrics to the configured location.
262     prometheus.wait_until_succeeds("test -e /var/lib/prometheus-pushgateway/metrics")
264     # Test thanos
265     prometheus.wait_for_unit("thanos-sidecar.service")
267     # Test if the Thanos query service can correctly retrieve the metric that was send above.
268     query.wait_for_unit("thanos-query.service")
269     wait_for_metric(query)
271     # Test Thanos query frontend service
272     query.wait_for_unit("thanos-query-frontend.service")
273     query.succeed("curl -sS http://localhost:${toString frontPort}/-/healthy")
275     # Test if the Thanos sidecar has correctly uploaded its TSDB to S3, if the
276     # Thanos storage service has correctly downloaded it from S3 and if the Thanos
277     # query service running on $store can correctly retrieve the metric:
278     store.wait_for_unit("thanos-store.service")
279     wait_for_metric(store)
281     store.wait_for_unit("thanos-compact.service")
283     # Test if the Thanos bucket command is able to retrieve blocks from the S3 bucket
284     # and check if the blocks have the correct labels:
285     store.succeed(
286         "thanos tools bucket ls "
287         + "--objstore.config-file=${nodes.store.config.services.thanos.store.objstore.config-file} "
288         + "--output=json | "
289         + "jq .thanos.labels.some_label | "
290         + "grep 'required by thanos'"
291     )
293     # Check if switching to a NixOS configuration that changes the prometheus
294     # configuration reloads (instead of restarts) prometheus before the switch
295     # finishes successfully:
296     with subtest("config change reloads prometheus"):
297         # We check if prometheus has finished reloading by looking for the message
298         # "Completed loading of configuration file" in the journal between the start
299         # and finish of switching to the new NixOS configuration.
300         #
301         # To mark the start we record the journal cursor before starting the switch:
302         cursor_before_switching = json.loads(
303             prometheus.succeed("journalctl -n1 -o json --output-fields=__CURSOR")
304         )["__CURSOR"]
306         # Now we switch:
307         prometheus_config_change = prometheus.succeed(
308             "readlink /run/current-system/specialisation/prometheus-config-change"
309         ).strip()
310         prometheus.succeed(prometheus_config_change + "/bin/switch-to-configuration test")
312         # Next we retrieve all logs since the start of switching:
313         logs_after_starting_switching = prometheus.succeed(
314             """
315               journalctl --after-cursor='{cursor_before_switching}' -o json --output-fields=MESSAGE
316             """.format(
317                 cursor_before_switching=cursor_before_switching
318             )
319         )
321         # Finally we check if the message "Completed loading of configuration file"
322         # occurs before the "finished switching to system configuration" message:
323         finished_switching_msg = (
324             "finished switching to system configuration " + prometheus_config_change
325         )
326         reloaded_before_switching_finished = False
327         finished_switching = False
328         for log_line in logs_after_starting_switching.split("\n"):
329             msg = json.loads(log_line)["MESSAGE"]
330             if "Completed loading of configuration file" in msg:
331                 reloaded_before_switching_finished = True
332             if msg == finished_switching_msg:
333                 finished_switching = True
334                 break
336         assert reloaded_before_switching_finished
337         assert finished_switching
339         # Check if the reloaded config includes the new s3-node_exporter job:
340         prometheus.succeed(
341           """
342             curl -sf http://127.0.0.1:${toString queryPort}/api/v1/status/config \
343               | jq -r .data.yaml \
344               | yq '.scrape_configs | any(.job_name == "s3-node_exporter")' \
345               | grep true
346           """
347         )
348   '';