infrastructure/modules/nixos/dgn-monitoring/default.nix
sinavir 57fb87baf8
All checks were successful
Build all the nodes / Jaccess01 (push) Successful in 23s
Run pre-commit on all files / pre-commit (push) Successful in 28s
Build all the nodes / Jaccess04 (push) Successful in 36s
Build all the nodes / bridge01 (push) Successful in 46s
Build all the nodes / geo02 (push) Successful in 48s
Build all the nodes / hypervisor02 (push) Successful in 49s
Build all the nodes / ap01 (push) Successful in 52s
Build all the nodes / hypervisor01 (push) Successful in 1m2s
Build all the nodes / geo01 (push) Successful in 1m3s
Build all the nodes / cof02 (push) Successful in 1m10s
Build all the nodes / build01 (push) Successful in 1m18s
Build all the nodes / compute01 (push) Successful in 1m19s
Build all the nodes / hypervisor03 (push) Successful in 58s
Build all the nodes / netcore02 (push) Successful in 23s
Build all the nodes / netcore01 (push) Successful in 23s
Build all the nodes / iso (push) Successful in 55s
Build all the nodes / lab-router01 (push) Successful in 50s
Build all the nodes / tower01 (push) Successful in 47s
Build all the nodes / storage01 (push) Successful in 54s
Build all the nodes / rescue01 (push) Successful in 1m6s
Build all the nodes / web02 (push) Successful in 50s
Build all the nodes / vault01 (push) Successful in 1m9s
Build all the nodes / web03 (push) Successful in 1m1s
Build the shell / build-shell (push) Successful in 30s
Build all the nodes / zulip01 (push) Successful in 53s
Build all the nodes / krz01 (push) Successful in 1m54s
Build all the nodes / web01 (push) Successful in 1m15s
fix(dgn-monitoring): Increase logs upload timeout
2025-06-20 17:13:05 +02:00

104 lines
2.3 KiB
Nix

# SPDX-FileCopyrightText: 2024 Tom Hubrecht <tom.hubrecht@dgnum.eu>
#
# SPDX-License-Identifier: EUPL-1.2
{
config,
lib,
pkgs,
# NOTE(review): `meta`, `name` and `nodeMeta` are not standard NixOS module
# arguments; presumably they are injected by the deployment tooling — confirm.
# `meta.network.storage01.netbirdIp` is the host that metrics and logs are sent to.
meta,
# Node name: used for the `node` remote-write label and as the first
# component of the reported hostname.
name,
# `nodeMeta.site` is the second component of the reported hostname.
nodeMeta,
...
}:
let
# Library helpers used throughout this module.
inherit (lib)
filterAttrs
mapAttrs
mapAttrsToList
mkDefault
mkEnableOption
mkForce
mkIf
mkOption
;
inherit (lib.types) attrsOf;
# Shorthand for this module's own option values.
cfg = config.dgn-monitoring;
in
{
imports = [ ./exporters.nix ];
# Options exposed by the DGNum monitoring module.
options.dgn-monitoring =
  let
    # Each scrape job is a free-form value of the YAML format type.
    yamlValue = (pkgs.formats.yaml { }).type;
  in
  {
    # Map from job name to the body of the corresponding scrape job.
    scrapeConfigs = mkOption {
      description = ''
        Specifications of `scrape_config` sections.
      '';
      type = attrsOf yamlValue;
    };

    # Unlike a plain `mkEnableOption`, monitoring is on by default.
    enable = (mkEnableOption "the DGNum monitoring system") // {
      default = true;
    };
  };
config = mkIf cfg.enable {
# Register a default scrape job for every enabled Prometheus exporter,
# targeting the exporter's port on the loopback address.
dgn-monitoring.scrapeConfigs =
  let
    # Attributes of `services.prometheus.exporters` that never get a default job.
    skipped = [
      "assertions"
      "warnings"
      "blackbox"
      "unifi-poller"
      "domain"
      "minio"
      "idrac"
      "pve"
      "tor"
    ];

    # The name check comes first on purpose: `&&` short-circuits, so `.enable`
    # is only read for entries that are not in the skip list.
    wanted = exporter: exporterCfg: !(builtins.elem exporter skipped) && exporterCfg.enable;

    # `mkDefault` lets other definitions replace the target list.
    toJob = _: exporterCfg: {
      static_configs = mkDefault [ { targets = [ "127.0.0.1:${builtins.toString exporterCfg.port}" ]; } ];
    };
  in
  mapAttrs toJob (filterAttrs wanted config.services.prometheus.exporters);
# Scrape the configured jobs with vmagent and forward the samples to storage01.
services.vmagent = {
  enable = true;

  # Remote-write destination; every sample is tagged with this node's name.
  flags."remoteWrite.url" = "http://${meta.network.storage01.netbirdIp}:8428/api/v1/write";
  flags."remoteWrite.label" = "node=${name}";

  prometheusConfig.global = {
    scrape_interval = "15s";
    external_labels = {
      hostname = "${name}.${nodeMeta.site}.infra.dgnum.eu";
    };
  };

  # Turn the `name -> job` attribute set into a list of jobs, each carrying
  # its attribute name as `job_name`.
  prometheusConfig.scrape_configs = mapAttrsToList (
    job: spec: spec // { job_name = job; }
  ) cfg.scrapeConfigs;
};
# Upload the local journal to the journald ingestion endpoint on storage01.
services.journald.upload = {
  enable = true;
  settings = {
    Upload = {
      URL = "http://${meta.network.storage01.netbirdIp}:9428/insert/journald";
      # Wait 30 minutes for the remote to become reachable again before exiting
      NetworkTimeoutSec = "30min";
    };
  };
};
# Don't restart too often to reduce e-mail notifications when the network or the database is down:
# wait 60 seconds between restarts (`mkForce` gives this value priority over other definitions).
systemd.services.systemd-journal-upload.serviceConfig.RestartSec = mkForce 60;
};
}