accessKey = "BKIKJAA5BMMU2RHO6IBB";
secretKey = "V7f1CwQqAcwo80UEIJEjc5gVQUSSx5ohQ9GSrr12";
bucket = "thanos-bucket";
endpoint = "s3:${toString minioPort}";
access_key = s3.accessKey;
secret_key = s3.secretKey;
signature_version2 = false;
put_user_metadata = {};
idle_conn_timeout = "0s";
insecure_skip_verify = false;
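
# For orientation: the objstore settings above end up in the configuration file
# that the thanos components read via --objstore.config-file (referenced in the
# testScript below). A rough, hand-written sketch of the corresponding Thanos S3
# client configuration, assuming the usual layout where the connection options
# live under http_config, would look like:
#
#   type: S3
#   config:
#     bucket: "thanos-bucket"
#     endpoint: "s3:<minioPort>"
#     access_key: "BKIKJAA5BMMU2RHO6IBB"
#     secret_key: "V7f1CwQqAcwo80UEIJEjc5gVQUSSx5ohQ9GSrr12"
#     signature_version2: false
#     put_user_metadata: {}
#     http_config:
#       idle_conn_timeout: "0s"
#       insecure_skip_verify: false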
in import ./make-test-python.nix {

  prometheus = { pkgs, ... }: {
    virtualisation.diskSize = 2 * 1024;
    virtualisation.memorySize = 2048;
    environment.systemPackages = [ pkgs.jq ];
    networking.firewall.allowedTCPPorts = [ grpcPort ];
    services.prometheus = {
      job_name = "prometheus";
      targets = [ "127.0.0.1:${toString queryPort}" ];
      labels = { instance = "localhost"; };
      job_name = "pushgateway";
      scrape_interval = "1s";
      targets = [ "127.0.0.1:${toString pushgwPort}" ];
      expr: count(up{job="prometheus"})
      some_label = "required by thanos";
      "--storage.tsdb.min-block-duration=5s"
      "--storage.tsdb.max-block-duration=5s"
    services.prometheus.pushgateway = {
      web.listen-address = ":${toString pushgwPort}";
      persistMetrics = true;
      persistence.interval = "1s";
      stateDir = "prometheus-pushgateway";
    };
    services.thanos.sidecar = {
      grpc-address = "0.0.0.0:${toString grpcPort}";
    # TODO: Add some tests for these services:
    # http-address = "0.0.0.0:19194";
    # grpc-address = "0.0.0.0:19193";
    # query.addresses = [
    # http-address = "0.0.0.0:19195";
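    #
    # A test for these would presumably follow the same pattern as the checks in
    # the testScript below; a sketch only (the "thanos-rule.service" unit name is
    # an assumption based on the naming of the other thanos-* units, and 19194 is
    # the http-address port from the commented-out options above):
    #
    #   prometheus.wait_for_unit("thanos-rule.service")
    #   prometheus.wait_for_open_port(19194)
    #   prometheus.succeed("curl -sf http://127.0.0.1:19194/metrics")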
    # Adds a "specialisation" of the above config which allows us to
    # "switch" to it and see if the services.prometheus.enableReload
    # functionality actually reloads the prometheus service instead of
    # restarting it.
    "prometheus-config-change" = {
      environment.systemPackages = [ pkgs.yq ];

      # This configuration just adds a new prometheus job
      # to scrape the node_exporter metrics of the s3 machine.
      services.prometheus = {
        job_name = "s3-node_exporter";
        targets = [ "s3:9100" ];
  query = { pkgs, ... }: {
    environment.systemPackages = [ pkgs.jq ];
    services.thanos.query = {
      http-address = "0.0.0.0:${toString queryPort}";
      "prometheus:${toString grpcPort}"
  store = { pkgs, ... }: {
    virtualisation.diskSize = 2 * 1024;
    virtualisation.memorySize = 2048;
    environment.systemPackages = with pkgs; [ jq thanos ];
    services.thanos.store = {
      http-address = "0.0.0.0:10902";
      grpc-address = "0.0.0.0:${toString grpcPort}";
      sync-block-duration = "1s";
    };
    services.thanos.compact = {
      http-address = "0.0.0.0:10903";
      consistency-delay = "5s";
    };
    services.thanos.query = {
      http-address = "0.0.0.0:${toString queryPort}";
      "localhost:${toString grpcPort}"
  s3 = { pkgs, ... }: {
    # Minio requires at least 1GiB of free disk space to run.
    networking.firewall.allowedTCPPorts = [ minioPort ];
    inherit (s3) accessKey secretKey;
    environment.systemPackages = [ pkgs.minio-client ];
    services.prometheus.exporters.node = {
  testScript = { nodes, ... }: ''
    import json

    # Before starting the other machines we first make sure that our S3 service is online
    # and has a bucket added for thanos:
    s3.wait_for_unit("minio.service")
    s3.wait_for_open_port(${toString minioPort})
    s3.succeed(
        "mc config host add minio "
        + "http://localhost:${toString minioPort} "
        + "${s3.accessKey} ${s3.secretKey} --api s3v4",
        "mc mb minio/thanos-bucket",
    )

    # Now that s3 has started we can start the other machines:
    for machine in prometheus, query, store:
        machine.start()

    # Check if prometheus responds to requests:
    prometheus.wait_for_unit("prometheus.service")
    prometheus.wait_for_open_port(${toString queryPort})
    prometheus.succeed("curl -sf http://127.0.0.1:${toString queryPort}/metrics")

    # Let's test if pushing a metric to the pushgateway succeeds:
    prometheus.wait_for_unit("pushgateway.service")
    prometheus.succeed(
        "echo 'some_metric 3.14' | "
        + "curl -f --data-binary \@- "
        + "http://127.0.0.1:${toString pushgwPort}/metrics/job/some_job"
    )
    # Now check whether that metric gets ingested by prometheus.
    # Since we'll check for the metric several times on different machines,
    # we abstract the test using the following function:

    # Function to check that the metric "some_metric" has been received and has the correct value.
    def wait_for_metric(machine):
        return machine.wait_until_succeeds(
            "curl -sf 'http://127.0.0.1:${toString queryPort}/api/v1/query?query=some_metric' | "
            + "jq '.data.result[0].value[1]' | grep '\"3.14\"'"
        )

    wait_for_metric(prometheus)

    # Let's test if the pushgateway persists metrics to the configured location.
    prometheus.wait_until_succeeds("test -e /var/lib/prometheus-pushgateway/metrics")
    prometheus.wait_for_unit("thanos-sidecar.service")

    # Test if the Thanos query service can correctly retrieve the metric that was sent above.
    query.wait_for_unit("thanos-query.service")
    wait_for_metric(query)

    # Test if the Thanos sidecar has correctly uploaded its TSDB to S3, if the
    # Thanos storage service has correctly downloaded it from S3, and if the Thanos
    # query service running on $store can correctly retrieve the metric:
    store.wait_for_unit("thanos-store.service")
    wait_for_metric(store)

    store.wait_for_unit("thanos-compact.service")

    # Test if the Thanos bucket command is able to retrieve blocks from the S3 bucket
    # and check if the blocks have the correct labels:
    "thanos tools bucket ls "
    + "--objstore.config-file=${nodes.store.config.services.thanos.store.objstore.config-file} "
    + "jq .thanos.labels.some_label | "
    + "grep 'required by thanos'"
    # Check if switching to a NixOS configuration that changes the prometheus
    # configuration reloads (instead of restarts) prometheus before the switch
    # finishes successfully:
    with subtest("config change reloads prometheus"):
        # We check if prometheus has finished reloading by looking for the message
        # "Completed loading of configuration file" in the journal between the start
        # and finish of switching to the new NixOS configuration.

        # To mark the start we record the journal cursor before starting the switch:
        cursor_before_switching = json.loads(
            prometheus.succeed("journalctl -n1 -o json --output-fields=__CURSOR")

        prometheus_config_change = prometheus.succeed(
            "readlink /run/current-system/specialisation/prometheus-config-change"

        prometheus.succeed(prometheus_config_change + "/bin/switch-to-configuration test")

        # Next we retrieve all logs since the start of switching:
        logs_after_starting_switching = prometheus.succeed(
            journalctl --after-cursor='{cursor_before_switching}' -o json --output-fields=MESSAGE
            cursor_before_switching=cursor_before_switching
        # Finally we check if the message "Completed loading of configuration file"
        # occurs before the "finished switching to system configuration" message:
        finished_switching_msg = (
            "finished switching to system configuration " + prometheus_config_change
        )

        reloaded_before_switching_finished = False
        finished_switching = False
        for log_line in logs_after_starting_switching.split("\n"):
            msg = json.loads(log_line)["MESSAGE"]
            if "Completed loading of configuration file" in msg:
                reloaded_before_switching_finished = True
            if msg == finished_switching_msg:
                finished_switching = True

        assert reloaded_before_switching_finished
        assert finished_switching

        # Check if the reloaded config includes the new s3-node_exporter job:
            curl -sf http://127.0.0.1:${toString queryPort}/api/v1/status/config \
              | yq '.scrape_configs | any(.job_name == "s3-node_exporter")' \