diff --git a/nixos/modules/module-list.nix b/nixos/modules/module-list.nix
index 8c6bc47df25..3f23151a4e3 100644
--- a/nixos/modules/module-list.nix
+++ b/nixos/modules/module-list.nix
@@ -519,6 +519,7 @@
./services/monitoring/systemhealth.nix
./services/monitoring/teamviewer.nix
./services/monitoring/telegraf.nix
+ ./services/monitoring/thanos.nix
./services/monitoring/ups.nix
./services/monitoring/uptime.nix
./services/monitoring/vnstat.nix
diff --git a/nixos/modules/services/monitoring/thanos.nix b/nixos/modules/services/monitoring/thanos.nix
new file mode 100644
index 00000000000..a34a1ecbfea
--- /dev/null
+++ b/nixos/modules/services/monitoring/thanos.nix
@@ -0,0 +1,756 @@
+{ config, lib, pkgs, ... }:
+
+with lib;
+
+let
+ cfg = config.services.thanos; # shorthand for this module's own option values
+
+  # Declare an option that accepts either `type` or null, with null as the
+  # default value.
+  nullOpt = type : description :
+    let nullable = types.nullOr type;
+    in mkOption { inherit description; type = nullable; default = null; };
+
+  optionToArgs = opt : v : if v == null then [] else [ ''--${opt}="${toString v}"'' ]; # null: no argument
+  flagToArgs = opt : v : if v then [ ''--${opt}'' ] else []; # false: no argument
+  listToArgs = opt : map (v : ''--${opt}="${v}"''); # one argument per element
+  attrsToArgs = opt : mapAttrsToList (k : v : ''--${opt}=${k}=\"${v}\"''); # one argument per pair
+
+  # Nullable parameter; its argument is produced by optionToArgs.
+  mkParam = type : description : {
+    option = nullOpt type description; toArgs = optionToArgs;
+  };
+  # Same as mkParam, with a note about Thanos' own default appended to the description.
+  mkParamDef = type : default : description : mkParam type (description + ''
+
+    Defaults to ${toString default} in Thanos
+    when set to null.
+  '');
+
+  # Boolean switch (default false); its argument is produced by flagToArgs.
+  mkFlagParam = description : {
+    option = mkOption {
+      inherit description;
+      type = types.bool; default = false;
+    };
+    toArgs = flagToArgs;
+  };
+
+  # List of strings rendered with the flag name `opt` (the option's own path is ignored).
+  mkListParam = opt : description : {
+    option = mkOption {
+      inherit description;
+      type = types.listOf types.str; default = [];
+    };
+    toArgs = _path : listToArgs opt;
+  };
+
+  # Attribute set of strings rendered with the flag name `opt` (the option's own path is ignored).
+  mkAttrsParam = opt : description : {
+    option = mkOption {
+      inherit description;
+      type = types.attrsOf types.str; default = {};
+    };
+    toArgs = _path : attrsToArgs opt;
+  };
+
+  # Directory name relative to /var/lib; handed to Thanos as the absolute path
+  # "/var/lib/<name>" under the flag name `opt`.
+  mkStateDirParam = opt : default : description : {
+    option = mkOption {
+      inherit default description; type = types.str;
+    };
+    toArgs = _path : stateDir : optionToArgs opt "/var/lib/${stateDir}";
+  };
+
+  # Serialise `attrs` to JSON and convert that file to YAML with json2yaml (from remarshal).
+  toYAML = name : attrs :
+    let json = builtins.toFile "${name}.json" (builtins.toJSON attrs);
+    in pkgs.runCommandNoCC name { inherit json; preferLocalBuild = true; nativeBuildInputs = [ pkgs.remarshal ]; }
+      ''json2yaml -i $json -o $out'';
+
+  # Build the command line for the Thanos sub-command `cmd` from its parameter set and config.
+  thanos = cmd :
+    let
+      sep = " \\\n ";
+      render = path : param :
+        param.toArgs (concatStringsSep "." path) (getAttrFromPath path cfg."${cmd}");
+      args = concatLists (collect isList (mapParamsRecursive render params."${cmd}"));
+    in "${cfg.package}/bin/thanos ${cmd}" + optionalString (args != []) (sep + concatStringsSep sep args);
+
+  # Map over a parameter tree; an attribute set that has both `toArgs` and
+  # `option` is treated as a leaf (a parameter) and is not descended into.
+  mapParamsRecursive = mapAttrsRecursiveCond (attr : !(attr ? "toArgs") || !(attr ? "option"));
+  # Project a parameter tree onto its option declarations.
+  paramsToOptions = mapParamsRecursive (_ : param : param.option);
+
+ params = {
+
+    # Logging flags; merged into the parameter set of every sub-command below.
+    log = {
+      log.format = mkParam types.str ''
+        Log format to use.
+      '';
+
+      log.level = mkParamDef (types.enum ["debug" "info" "warn" "error" "fatal"]) "info" ''
+        Log filtering level.
+      '';
+    };
+
+    # Google Cloud Trace flags; merged into the parameter set of every sub-command below.
+    gcloudtrace = {
+      gcloudtrace.project = mkParam types.str ''
+        GCP project to send Google Cloud Trace tracings to.
+
+        If null, tracing will be disabled.
+      '';
+
+      gcloudtrace.sample-factor = mkParamDef types.int 1 ''
+        How often we send traces (1/sample-factor).
+
+        If 0 no trace will be sent periodically, unless
+        forced by baggage item.
+      '';
+    };
+
+    # Flags shared by the sidecar, store, query, rule and receive sub-commands.
+    common = params.log // params.gcloudtrace // {
+      http-address = mkParamDef types.str "0.0.0.0:10902" ''
+        Listen host:port for HTTP endpoints.
+      '';
+
+      grpc-address = mkParamDef types.str "0.0.0.0:10901" ''
+        Listen ip:port address for gRPC endpoints (StoreAPI).
+
+        Make sure this address is routable from other components if you use gossip,
+        --grpc-advertise-address is empty and you require cross-node connection.
+      '';
+
+      grpc-server-tls-cert = mkParam types.str ''
+        TLS Certificate for gRPC server, leave blank to disable TLS
+      '';
+
+      grpc-server-tls-key = mkParam types.str ''
+        TLS Key for the gRPC server, leave blank to disable TLS
+      '';
+
+      grpc-server-tls-client-ca = mkParam types.str ''
+        TLS CA to verify clients against.
+
+        If no client CA is specified, there is no client verification on server side.
+        (tls.NoClientCert)
+      '';
+    };
+
+    # Object store parameters for one sub-command; `cfg` here is that sub-command's config.
+    objstore = cfg : {
+      objstore.config-file = {
+        toArgs = _opt : path : optionToArgs "objstore.config-file" path;
+        option = mkOption {
+          type = with types; nullOr str;
+          default = if cfg.objstore.config == null then null
+                    else toString (toYAML "objstore.yaml" cfg.objstore.config);
+          defaultText = ''
+            if config.services.thanos.<cmd>.objstore.config == null then null
+            else toString (toYAML "objstore.yaml" config.services.thanos.<cmd>.objstore.config);
+          '';
+          description = ''
+            Path to YAML file that contains object store configuration.
+          '';
+        };
+      };
+
+      objstore.config =
+        {
+          toArgs = _opt : _attrs : []; # only used through objstore.config-file's default
+          option = nullOpt types.attrs ''
+            Object store configuration.
+
+            When not null the attribute set gets converted to
+            a YAML file and stored in the Nix store. The option
+            objstore.config-file will default to its path.
+
+            If objstore.config-file is set this option has no effect.
+          '';
+        };
+    };
+
+    # Parameters of `thanos sidecar`.
+    sidecar = params.common // params.objstore cfg.sidecar // {
+      prometheus.url = mkParamDef types.str "http://localhost:9090" ''
+        URL at which to reach Prometheus's API.
+
+        For better performance use local network.
+      '';
+      tsdb.path =
+        let promDataDir = "/var/lib/${config.services.prometheus2.stateDir}/data";
+        in {
+          option = mkOption {
+            type = types.str;
+            default = promDataDir;
+            defaultText = "/var/lib/\${config.services.prometheus2.stateDir}/data";
+            description = ''
+              Data directory of TSDB.
+            '';
+          };
+          toArgs = optionToArgs;
+        };
+
+      reloader = {
+        config-file = mkParam types.str ''
+          Config file watched by the reloader.
+        '';
+        config-envsubst-file = mkParam types.str ''
+          Output file for environment variable substituted config file.
+        '';
+        rule-dirs = mkListParam "reloader.rule-dir" ''
+          Rule directories for the reloader to refresh.
+        '';
+      };
+    };
+
+    # Parameters of `thanos store`.
+    store = params.common // params.objstore cfg.store // {
+      stateDir = mkStateDirParam "data-dir" "thanos-store" ''
+        Data directory relative to /var/lib
+        in which to cache remote blocks.
+      '';
+
+      index-cache-size = mkParamDef types.str "250MB" ''
+        Maximum size of items held in the index cache.
+      '';
+      chunk-pool-size = mkParamDef types.str "2GB" ''
+        Maximum size of concurrently allocatable bytes for chunks.
+      '';
+
+      store.grpc = {
+        series-sample-limit = mkParamDef types.int 0 ''
+          Maximum amount of samples returned via a single Series call.
+
+          0 means no limit.
+
+          NOTE: for efficiency we take 120 as the number of samples in chunk (it
+          cannot be bigger than that), so the actual number of samples might be
+          lower, even though the maximum could be hit.
+        '';
+        series-max-concurrency = mkParamDef types.int 20 ''
+          Maximum number of concurrent Series calls.
+        '';
+      };
+
+      sync-block-duration = mkParamDef types.str "3m" ''
+        Repeat interval for syncing the blocks between local and remote view.
+      '';
+
+      block-sync-concurrency = mkParamDef types.int 20 ''
+        Number of goroutines to use when syncing blocks from object storage.
+      '';
+    };
+
+    # Parameters of `thanos query`.
+    query = params.common // {
+      http-advertise-address = mkParam types.str ''
+        Explicit (external) host:port address to advertise
+        for HTTP QueryAPI in gossip cluster.
+
+        If null, the option http-address
+        will be used.
+      '';
+
+      grpc-client-tls-secure = mkFlagParam ''
+        Use TLS when talking to the gRPC server
+      '';
+
+      grpc-client-tls-cert = mkParam types.str ''
+        TLS Certificates to use to identify this client to the server
+      '';
+
+      grpc-client-tls-key = mkParam types.str ''
+        TLS Key for the client's certificate
+      '';
+
+      grpc-client-tls-ca = mkParam types.str ''
+        TLS CA Certificates to use to verify gRPC servers
+      '';
+
+      grpc-client-server-name = mkParam types.str ''
+        Server name to verify the hostname on the returned gRPC certificates.
+        See https://tools.ietf.org/html/rfc4366#section-3.1
+      '';
+
+      web.route-prefix = mkParam types.str ''
+        Prefix for API and UI endpoints.
+
+        This allows thanos UI to be served on a sub-path. This option is
+        analogous to --web.route-prefix of Prometheus.
+      '';
+
+      web.external-prefix = mkParam types.str ''
+        Static prefix for all HTML links and redirect URLs in the UI query web
+        interface.
+
+        Actual endpoints are still served on / or the
+        web.route-prefix. This allows thanos UI to be served
+        behind a reverse proxy that strips a URL sub-path.
+      '';
+
+      web.prefix-header = mkParam types.str ''
+        Name of HTTP request header used for dynamic prefixing of UI links and
+        redirects.
+
+        This option is ignored if the option
+        web.external-prefix is set.
+
+        Security risk: enable this option only if a reverse proxy in front of
+        thanos is resetting the header.
+
+        The setting web.prefix-header="X-Forwarded-Prefix"
+        can be useful, for example, if Thanos UI is served via Traefik reverse
+        proxy with PathPrefixStrip option enabled, which
+        sends the stripped prefix value in X-Forwarded-Prefix
+        header. This allows thanos UI to be served on a sub-path.
+      '';
+
+      query.timeout = mkParamDef types.str "2m" ''
+        Maximum time to process query by query node.
+      '';
+
+      query.max-concurrent = mkParamDef types.int 20 ''
+        Maximum number of queries processed concurrently by query node.
+      '';
+
+      query.replica-label = mkParam types.str ''
+        Label to treat as a replica indicator along which data is
+        deduplicated.
+
+        Still you will be able to query without deduplication using
+        dedup=false parameter.
+      '';
+
+      selector-labels = mkAttrsParam "selector-label" ''
+        Query selector labels that will be exposed in info endpoint.
+      '';
+
+      store.addresses = mkListParam "store" ''
+        Addresses of statically configured store API servers.
+
+        The scheme may be prefixed with dns+ or
+        dnssrv+ to detect store API servers through
+        respective DNS lookups.
+      '';
+
+      store.sd-files = mkListParam "store.sd-files" ''
+        Path to files that contain addresses of store API servers. The path
+        can be a glob pattern.
+      '';
+
+      store.sd-interval = mkParamDef types.str "5m" ''
+        Refresh interval to re-read file SD files. It is used as a resync fallback.
+      '';
+
+      store.sd-dns-interval = mkParamDef types.str "30s" ''
+        Interval between DNS resolutions.
+      '';
+
+      store.unhealthy-timeout = mkParamDef types.str "5m" ''
+        Timeout before an unhealthy store is cleaned from the store UI page.
+      '';
+
+      query.auto-downsampling = mkFlagParam ''
+        Enable automatic adjustment (step / 5) to what source of data should
+        be used in store gateways if no
+        max_source_resolution param is specified.
+      '';
+
+      query.partial-response = mkFlagParam ''
+        Enable partial response for queries if no
+        partial_response param is specified.
+      '';
+
+      query.default-evaluation-interval = mkParamDef types.str "1m" ''
+        Set default evaluation interval for sub queries.
+      '';
+
+      store.response-timeout = mkParamDef types.str "0ms" ''
+        If a Store doesn't send any data in this specified duration then a
+        Store will be ignored and partial data will be returned if it's
+        enabled. 0 disables timeout.
+      '';
+    };
+
+    # Parameters of `thanos rule`.
+    rule = params.common // params.objstore cfg.rule // {
+      labels = mkAttrsParam "label" ''
+        Labels to be applied to all generated metrics.
+
+        Similar to external labels for Prometheus,
+        used to identify ruler and its blocks as unique source.
+      '';
+
+      stateDir = mkStateDirParam "data-dir" "thanos-rule" ''
+        Data directory relative to /var/lib.
+      '';
+
+      rule-files = mkListParam "rule-file" ''
+        Rule files that should be used by rule manager. Can be in glob format.
+      '';
+
+      eval-interval = mkParamDef types.str "30s" ''
+        The default evaluation interval to use.
+      '';
+
+      tsdb.block-duration = mkParamDef types.str "2h" ''
+        Block duration for TSDB block.
+      '';
+
+      tsdb.retention = mkParamDef types.str "48h" ''
+        Block retention time on local disk.
+      '';
+
+      alertmanagers.urls = mkListParam "alertmanagers.url" ''
+        Alertmanager replica URLs to push firing alerts.
+
+        Ruler claims success if push to at least one alertmanager from
+        discovered succeeds. The scheme may be prefixed with
+        dns+ or dnssrv+ to detect
+        Alertmanager IPs through respective DNS lookups. The port defaults to
+        9093 or the SRV record's value. The URL path is
+        used as a prefix for the regular Alertmanager API path.
+      '';
+
+      alertmanagers.send-timeout = mkParamDef types.str "10s" ''
+        Timeout for sending alerts to alertmanager.
+      '';
+
+      alert.query-url = mkParam types.str ''
+        The external Thanos Query URL that would be set in all alerts 'Source' field.
+      '';
+
+      alert.label-drop = mkListParam "alert.label-drop" ''
+        Labels by name to drop before sending to alertmanager.
+
+        This allows alert to be deduplicated on replica label.
+
+        Similar to Prometheus alert relabelling.
+      '';
+
+      web.route-prefix = mkParam types.str ''
+        Prefix for API and UI endpoints.
+
+        This allows thanos UI to be served on a sub-path.
+
+        This option is analogous to --web.route-prefix of Prometheus.
+      '';
+
+      web.external-prefix = mkParam types.str ''
+        Static prefix for all HTML links and redirect URLs in the UI query web
+        interface.
+
+        Actual endpoints are still served on / or the
+        web.route-prefix. This allows thanos UI to be served
+        behind a reverse proxy that strips a URL sub-path.
+      '';
+
+      web.prefix-header = mkParam types.str ''
+        Name of HTTP request header used for dynamic prefixing of UI links and
+        redirects.
+
+        This option is ignored if the option
+        web.external-prefix is set.
+
+        Security risk: enable this option only if a reverse proxy in front of
+        thanos is resetting the header.
+
+        The header X-Forwarded-Prefix can be useful, for
+        example, if Thanos UI is served via Traefik reverse proxy with
+        PathPrefixStrip option enabled, which sends the
+        stripped prefix value in X-Forwarded-Prefix
+        header. This allows thanos UI to be served on a sub-path.
+      '';
+
+      query.addresses = mkListParam "query" ''
+        Addresses of statically configured query API servers.
+
+        The scheme may be prefixed with dns+ or
+        dnssrv+ to detect query API servers through
+        respective DNS lookups.
+      '';
+
+      query.sd-files = mkListParam "query.sd-files" ''
+        Path to file that contain addresses of query peers.
+        The path can be a glob pattern.
+      '';
+
+      query.sd-interval = mkParamDef types.str "5m" ''
+        Refresh interval to re-read file SD files. (used as a fallback)
+      '';
+
+      query.sd-dns-interval = mkParamDef types.str "30s" ''
+        Interval between DNS resolutions.
+      '';
+    };
+
+    # Parameters of `thanos compact`.
+    compact = params.log // params.gcloudtrace // params.objstore cfg.compact // {
+      http-address = mkParamDef types.str "0.0.0.0:10902" ''
+        Listen host:port for HTTP endpoints.
+      '';
+
+      stateDir = mkStateDirParam "data-dir" "thanos-compact" ''
+        Data directory relative to /var/lib
+        in which to cache blocks and process compactions.
+      '';
+
+      consistency-delay = mkParamDef types.str "30m" ''
+        Minimum age of fresh (non-compacted) blocks before they are being
+        processed. Malformed blocks older than the maximum of consistency-delay
+        and 30m0s will be removed.
+      '';
+
+      retention = {
+        resolution-raw = mkParamDef types.str "0d" ''
+          How long to retain raw samples in bucket.
+
+          0d - disables this retention
+        '';
+        resolution-5m = mkParamDef types.str "0d" ''
+          How long to retain samples of resolution 1 (5 minutes) in bucket.
+
+          0d - disables this retention
+        '';
+        resolution-1h = mkParamDef types.str "0d" ''
+          How long to retain samples of resolution 2 (1 hour) in bucket.
+
+          0d - disables this retention
+        '';
+      };
+
+      # Rendered as the --wait flag, which is passed exactly when startAt is null.
+      startAt = {
+        option = nullOpt types.str ''
+          When this option is set to a systemd.time
+          specification the Thanos compactor will run at the specified period.
+
+          When this option is null the Thanos compactor service
+          will run continuously. So it will not exit after all compactions have
+          been processed but wait for new work.
+        '';
+        toArgs = _opt : startAt : flagToArgs "wait" (startAt == null);
+      };
+
+      block-sync-concurrency = mkParamDef types.int 20 ''
+        Number of goroutines to use when syncing block metadata from object storage.
+      '';
+      compact.concurrency = mkParamDef types.int 1 ''
+        Number of goroutines to use when compacting groups.
+      '';
+    };
+
+    # Parameters of `thanos downsample`: logging, tracing and object store flags
+    # plus a working directory.
+    downsample = params.log // params.gcloudtrace // params.objstore cfg.downsample // {
+      stateDir = mkStateDirParam "data-dir" "thanos-downsample" ''
+        Data directory relative to /var/lib
+        in which to cache blocks and process downsamplings.
+      '';
+    };
+
+    # Parameters of `thanos receive`.
+    receive = params.common // params.objstore cfg.receive // {
+      stateDir = mkStateDirParam "tsdb.path" "thanos-receive" ''
+        Data directory relative to /var/lib of TSDB.
+      '';
+
+      tsdb.retention = mkParamDef types.str "15d" ''
+        How long to retain raw samples on local storage.
+
+        0d - disables this retention
+      '';
+
+      remote-write.address = mkParamDef types.str "0.0.0.0:19291" ''
+        Address to listen on for remote write requests.
+      '';
+
+      labels = mkAttrsParam "labels" ''
+        External labels to announce.
+
+        This flag will be removed in the future when handling multiple tsdb
+        instances is added.
+      '';
+    };
+
+ };
+
+  # Config fragment asserting that services.thanos.<cmd>.stateDir does not start with "/".
+  assertRelativeStateDir = cmd :
+    let dir = cfg."${cmd}".stateDir;
+    in {
+      assertions = [ {
+        assertion = !(hasPrefix "/" dir);
+        message = "The option services.thanos.${cmd}.stateDir should not be an absolute directory."
+          + " It should be a directory relative to /var/lib.";
+      } ];
+    };
+
+in {
+
+  options.services.thanos = {
+    # Besides `package`, every sub-command gets `enable` plus the options derived from its parameter set.
+    package = mkOption {
+      type = types.package;
+      default = pkgs.thanos;
+      defaultText = "pkgs.thanos";
+      description = ''
+        The thanos package that should be used.
+      '';
+    };
+
+    sidecar = paramsToOptions params.sidecar // {
+      enable = mkEnableOption
+        "the Thanos sidecar for Prometheus server";
+    };
+
+    store = paramsToOptions params.store // {
+      enable = mkEnableOption
+        "the Thanos store node giving access to blocks in a bucket provider";
+    };
+
+    query = paramsToOptions params.query // {
+      enable = mkEnableOption
+        ("the Thanos query node exposing PromQL enabled Query API " +
+         "with data retrieved from multiple store nodes");
+    };
+
+    rule = paramsToOptions params.rule // {
+      enable = mkEnableOption
+        ("the Thanos ruler service which evaluates Prometheus rules against" +
+        " given Query nodes, exposing Store API and storing old blocks in bucket");
+    };
+
+    compact = paramsToOptions params.compact // {
+      enable = mkEnableOption
+        "the Thanos compactor which continuously compacts blocks in an object store bucket";
+    };
+
+    downsample = paramsToOptions params.downsample // {
+      enable = mkEnableOption
+        "the Thanos downsampler which continuously downsamples blocks in an object store bucket";
+    };
+
+    receive = paramsToOptions params.receive // {
+      enable = mkEnableOption
+        ("the Thanos receiver which accept Prometheus remote write API requests " +
+         "and write to local tsdb (EXPERIMENTAL, this may change drastically without notice)");
+    };
+  };
+
+ config = mkMerge [
+
+    (mkIf cfg.sidecar.enable {
+      assertions = [ { # the unit below runs as User=prometheus, ordered after prometheus2.service
+        assertion = config.services.prometheus2.enable;
+        message = "Please enable services.prometheus2 when enabling services.thanos.sidecar.";
+      } ];
+      systemd.services.thanos-sidecar = {
+        wantedBy = [ "multi-user.target" ];
+        after = [ "network.target" "prometheus2.service" ];
+        serviceConfig = { User = "prometheus"; Restart = "always"; ExecStart = thanos "sidecar"; };
+      };
+    })
+
+    # systemd unit for `thanos store`; stateDir doubles as the unit's StateDirectory.
+    (mkIf cfg.store.enable (mkMerge [
+      (assertRelativeStateDir "store")
+      {
+        systemd.services.thanos-store = {
+          after = [ "network.target" ];
+          wantedBy = [ "multi-user.target" ];
+          serviceConfig = {
+            ExecStart = thanos "store";
+            Restart = "always";
+            DynamicUser = true; StateDirectory = cfg.store.stateDir;
+          };
+        };
+      }
+    ]))
+
+    # systemd unit for `thanos query`; no StateDirectory is configured for it.
+    (mkIf cfg.query.enable {
+      systemd.services.thanos-query = {
+        after = [ "network.target" ];
+        wantedBy = [ "multi-user.target" ];
+        serviceConfig = {
+          ExecStart = thanos "query";
+          Restart = "always"; DynamicUser = true;
+        };
+      };
+    })
+
+    # systemd unit for `thanos rule`; stateDir doubles as the unit's StateDirectory.
+    (mkIf cfg.rule.enable (mkMerge [
+      (assertRelativeStateDir "rule")
+      {
+        systemd.services.thanos-rule = {
+          after = [ "network.target" ];
+          wantedBy = [ "multi-user.target" ];
+          serviceConfig = {
+            ExecStart = thanos "rule";
+            Restart = "always";
+            DynamicUser = true; StateDirectory = cfg.rule.stateDir;
+          };
+        };
+      }
+    ]))
+
+    # systemd unit for `thanos compact`: a restarting "simple" service when startAt is null, else a oneshot with startAt.
+    (mkIf cfg.compact.enable (mkMerge [
+      (assertRelativeStateDir "compact")
+      {
+        systemd.services.thanos-compact =
+          let scheduled = cfg.compact.startAt != null;
+              mode = if scheduled then { Type = "oneshot"; Restart = "no"; }
+                     else { Type = "simple"; Restart = "always"; };
+          in {
+            wantedBy = [ "multi-user.target" ];
+            after = [ "network.target" ];
+            serviceConfig = mode // {
+              DynamicUser = true; StateDirectory = cfg.compact.stateDir; ExecStart = thanos "compact";
+            };
+          } // optionalAttrs scheduled { inherit (cfg.compact) startAt; };
+      }
+    ]))
+
+    # systemd unit for `thanos downsample`; stateDir doubles as the unit's StateDirectory.
+    (mkIf cfg.downsample.enable (mkMerge [
+      (assertRelativeStateDir "downsample")
+      {
+        systemd.services.thanos-downsample = {
+          after = [ "network.target" ];
+          wantedBy = [ "multi-user.target" ];
+          serviceConfig = {
+            ExecStart = thanos "downsample";
+            Restart = "always";
+            DynamicUser = true; StateDirectory = cfg.downsample.stateDir;
+          };
+        };
+      }
+    ]))
+
+    # systemd unit for `thanos receive`; stateDir doubles as the unit's StateDirectory.
+    (mkIf cfg.receive.enable (mkMerge [
+      (assertRelativeStateDir "receive")
+      {
+        systemd.services.thanos-receive = {
+          after = [ "network.target" ];
+          wantedBy = [ "multi-user.target" ];
+          serviceConfig = {
+            ExecStart = thanos "receive";
+            Restart = "always";
+            DynamicUser = true; StateDirectory = cfg.receive.stateDir;
+          };
+        };
+      }
+    ]))
+
+ ];
+}
diff --git a/nixos/tests/prometheus-2.nix b/nixos/tests/prometheus-2.nix
index d7035d49ad4..3e2c675b7c6 100644
--- a/nixos/tests/prometheus-2.nix
+++ b/nixos/tests/prometheus-2.nix
@@ -1,9 +1,44 @@
-import ./make-test.nix {
+let
+  # Ports used by the services under test.
+  minioPort = 9000;
+  queryPort = 9090;
+  pushgwPort = 9091;
+  grpcPort = 19090;
+
+  # Throw-away credentials for the Minio instance on the `s3` node.
+  s3 = {
+    accessKey = "BKIKJAA5BMMU2RHO6IBB";
+    secretKey = "V7f1CwQqAcwo80UEIJEjc5gVQUSSx5ohQ9GSrr12";
+  };
+
+  # Thanos object store configuration pointing at that Minio instance.
+  objstore.config = {
+    type = "S3";
+    config = {
+      bucket = "thanos-bucket";
+      endpoint = "s3:${toString minioPort}";
+      region = "us-east-1";
+      access_key = s3.accessKey;
+      secret_key = s3.secretKey;
+      insecure = true;
+      signature_version2 = false;
+      encrypt_sse = false;
+      put_user_metadata = {};
+      http_config = {
+        idle_conn_timeout = "0s"; insecure_skip_verify = false;
+      };
+      trace.enable = false;
+    };
+  };
+
+in import ./make-test.nix {
name = "prometheus-2";
nodes = {
- one = { pkgs, ... }: {
+ prometheus = { pkgs, ... }: {
+ virtualisation.diskSize = 2 * 1024;
environment.systemPackages = [ pkgs.jq ];
+ networking.firewall.allowedTCPPorts = [ grpcPort ];
services.prometheus2 = {
enable = true;
scrapeConfigs = [
@@ -11,7 +46,7 @@ import ./make-test.nix {
job_name = "prometheus";
static_configs = [
{
- targets = [ "127.0.0.1:9090" ];
+ targets = [ "127.0.0.1:${toString queryPort}" ];
labels = { instance = "localhost"; };
}
];
@@ -21,7 +56,7 @@ import ./make-test.nix {
scrape_interval = "1s";
static_configs = [
{
- targets = [ "127.0.0.1:9091" ];
+ targets = [ "127.0.0.1:${toString pushgwPort}" ];
}
];
}
@@ -35,33 +70,169 @@ import ./make-test.nix {
expr: count(up{job="prometheus"})
''
];
+        # The test script later greps the bucket listing for this external
+        # label value (see the `thanos bucket ls` call).
+        globalConfig.external_labels = {
+          some_label = "required by thanos";
+        };
+        extraFlags = [
+          # Required by thanos
+          "--storage.tsdb.min-block-duration=5s"
+          "--storage.tsdb.max-block-duration=5s"
+        ];
};
services.prometheus.pushgateway = {
enable = true;
+ web.listen-address = ":${toString pushgwPort}"; # same port as the pushgateway scrape target above
persistMetrics = true;
persistence.interval = "1s";
stateDir = "prometheus-pushgateway";
};
+      services.thanos = {
+        # Sidecar for the Prometheus above; its gRPC port is the one opened in the firewall.
+        sidecar = {
+          inherit objstore;
+          enable = true;
+          grpc-address = "0.0.0.0:${toString grpcPort}";
+        };
+        # TODO: Add some tests for these services:
+        #rule = {
+        #  enable = true;
+        #  http-address = "0.0.0.0:19194";
+        #  grpc-address = "0.0.0.0:19193";
+        #  query.addresses = [
+        #    "localhost:19191"
+        #  ];
+        #  labels = {
+        #    just = "some";
+        #    nice = "labels";
+        #  };
+        #};
+        #
+        #receive = {
+        #  http-address = "0.0.0.0:19195";
+        #  enable = true;
+        #  labels = {
+        #    just = "some";
+        #    nice = "labels";
+        #  };
+        #};
+      };
+ };
+
+    # Thanos query node that reads from the sidecar on the `prometheus` machine.
+    query = { pkgs, ... }: {
+      services.thanos.query = {
+        enable = true;
+        store.addresses = [ "prometheus:${toString grpcPort}" ];
+        http-address = "0.0.0.0:${toString queryPort}";
+      };
+      # jq is used by the test script to pick the metric value out of the response.
+      environment.systemPackages = [ pkgs.jq ];
+    };
+
+    # Store node: `thanos store` and `thanos compact` on the shared bucket, plus
+    # a local `thanos query` that reads from the store service.
+    store = { pkgs, ... }: {
+      environment.systemPackages = with pkgs; [ jq thanos ];
+      services.thanos = {
+        store = {
+          enable = true; inherit objstore;
+          http-address = "0.0.0.0:10902";
+          grpc-address = "0.0.0.0:${toString grpcPort}";
+          sync-block-duration = "1s";
+        };
+        compact = {
+          enable = true; inherit objstore;
+          http-address = "0.0.0.0:10903";
+          consistency-delay = "5s";
+        };
+        query = {
+          enable = true;
+          http-address = "0.0.0.0:${toString queryPort}";
+          store.addresses = [ "localhost:${toString grpcPort}" ];
+        };
+      };
+    };
+
+ s3 = { pkgs, ... } : {
+      services.minio = {
+        inherit (s3) accessKey secretKey;
+        enable = true;
+      };
+      networking.firewall.allowedTCPPorts = [ minioPort ];
+      # Minio requires at least 1GiB of free disk space to run.
+      virtualisation.diskSize = 2 * 1024;
+
+      # Provides `mc`, which the test script uses to create the bucket.
+      environment.systemPackages = [ pkgs.minio-client ];
};
};
- testScript = ''
- startAll;
- $one->waitForUnit("prometheus2.service");
- $one->waitForOpenPort(9090);
- $one->succeed("curl -s http://127.0.0.1:9090/metrics");
+ testScript = { nodes, ... } : ''
+ # Before starting the other machines we first make sure that our S3 service is online
+ # and has a bucket added for thanos:
+ $s3->start;
+ $s3->waitForUnit("minio.service");
+ $s3->waitForOpenPort(${toString minioPort});
+ $s3->succeed(
+ "mc config host add minio " .
+ "http://localhost:${toString minioPort} ${s3.accessKey} ${s3.secretKey} S3v4");
+ $s3->succeed("mc mb minio/thanos-bucket");
- # Let's test if pushing a metric to the pushgateway succeeds
- # and whether that metric gets ingested by prometheus.
- $one->waitForUnit("pushgateway.service");
- $one->succeed(
+ # Now that s3 has started we can start the other machines:
+ $prometheus->start;
+ $query->start;
+ $store->start;
+
+ # Check if prometheus responds to requests:
+ $prometheus->waitForUnit("prometheus2.service");
+ $prometheus->waitForOpenPort(${toString queryPort});
+ $prometheus->succeed("curl -s http://127.0.0.1:${toString queryPort}/metrics");
+
+ # Let's test if pushing a metric to the pushgateway succeeds:
+ $prometheus->waitForUnit("pushgateway.service");
+ $prometheus->succeed(
"echo 'some_metric 3.14' | " .
- "curl --data-binary \@- http://127.0.0.1:9091/metrics/job/some_job");
- $one->waitUntilSucceeds(
- "curl -sf 'http://127.0.0.1:9090/api/v1/query?query=some_metric' " .
- "| jq '.data.result[0].value[1]' | grep '\"3.14\"'");
+ "curl --data-binary \@- http://127.0.0.1:${toString pushgwPort}/metrics/job/some_job");
+
+ # Now check whether that metric gets ingested by prometheus.
+ # Since we'll check for the metric several times on different machines
+ # we abstract the test using the following function:
+
+ # Function to check if the metric "some_metric" has been received and returns the correct value.
+ local *Machine::waitForMetric = sub {
+ my ($self) = @_;
+ $self->waitUntilSucceeds(
+ "curl -sf 'http://127.0.0.1:${toString queryPort}/api/v1/query?query=some_metric' " .
+ "| jq '.data.result[0].value[1]' | grep '\"3.14\"'");
+ };
+
+ $prometheus->waitForMetric;
# Let's test if the pushgateway persists metrics to the configured location.
- $one->waitUntilSucceeds("test -e /var/lib/prometheus-pushgateway/metrics");
+ $prometheus->waitUntilSucceeds("test -e /var/lib/prometheus-pushgateway/metrics");
+
+ # Test thanos
+ $prometheus->waitForUnit("thanos-sidecar.service");
+
+ # Test if the Thanos query service can correctly retrieve the metric that was sent above.
+ $query->waitForUnit("thanos-query.service");
+ $query->waitForMetric;
+
+ # Test if the Thanos sidecar has correctly uploaded its TSDB to S3, if the
+ # Thanos storage service has correctly downloaded it from S3 and if the Thanos
+ # query service running on $store can correctly retrieve the metric:
+ $store->waitForUnit("thanos-store.service");
+ $store->waitForMetric;
+
+ $store->waitForUnit("thanos-compact.service");
+
+ # Test if the Thanos bucket command is able to retrieve blocks from the S3 bucket
+ # and check if the blocks have the correct labels:
+ $store->succeed(
+ "thanos bucket ls" .
+ " --objstore.config-file=${nodes.store.config.services.thanos.store.objstore.config-file}" .
+ " --output=json | jq .thanos.labels.some_label | grep 'required by thanos'");
'';
}