diff --git a/flake.lock b/flake.lock
index 9d5a28c..70e52a9 100644
--- a/flake.lock
+++ b/flake.lock
@@ -539,11 +539,11 @@
     },
     "nix-secrets": {
       "locked": {
-        "lastModified": 1738358831,
-        "narHash": "sha256-BFkqC7xQwGpA7mYYGDBkzw9iehWao+BkR5Bp/dFicWY=",
+        "lastModified": 1738685297,
+        "narHash": "sha256-JOv3+toYlftzBm47QF5tzaBhTbQIm1IBq1tKeQrQLyM=",
         "ref": "refs/heads/master",
-        "rev": "e7311c8f523ad3ffe187efe63f6438140fa0cf45",
-        "revCount": 268,
+        "rev": "3be1d509f9823292dd9ca6b396743fbf722bd8b9",
+        "revCount": 269,
         "type": "git",
         "url": "ssh://git@git.bitlab21.com/sam/nix-secrets.git"
       },
diff --git a/hosts/common/optional/nixos-containers/metrics-server.nix b/hosts/common/optional/nixos-containers/metrics-server.nix
index 148fbac..dd7c746 100644
--- a/hosts/common/optional/nixos-containers/metrics-server.nix
+++ b/hosts/common/optional/nixos-containers/metrics-server.nix
@@ -8,6 +8,9 @@
   containerName = "metrics-server";
   containerIp = configVars.networking.addresses.metrics-server.ip;
 
+  notifybotJid = configVars.xmpp.notifybotJid; # XMPP account the alert bot logs in as
+  receiverJid = configVars.xmpp.personalAccount; # XMPP account that receives the alerts
+
   dockerContainerIp = configVars.networking.addresses.docker.ip;
   smWorkerIp = configVars.networking.addresses.sm-worker.ip;
   merlinIp = configVars.networking.addresses.merlin.ip;
@@ -83,6 +86,7 @@ in {
             config.services.prometheus.port
             config.services.grafana.port
             config.services.prometheus.exporters.blackbox.port
+            9199 # xmpp-alerts webhook listen port (must match listen_port below)
           ];
         };
         useHostResolvConf = lib.mkForce false;
@@ -98,6 +102,9 @@ in {
         secrets = {
           "software/restic-passphrase" = {};
           "software/restic-exporter-credentials" = {};
+          "comms/xmpp/notifybot/password" = {
+            mode = "0644"; # NOTE(review): world-readable secret; presumably so the xmpp-alerts service user can `cat` it - confirm and tighten if possible
+          };
         };
       };
 
@@ -112,9 +119,48 @@ in {
         pkgs.git
       ];
 
+      services.grafana = {
+        enable = true;
+        settings.server = {
+          http_port = 2342;
+          http_addr = "0.0.0.0";
+        };
+      };
+
+      # main prometheus service: scrapes the exporters, evaluates the alert rules and pushes firing alerts to the local Alertmanager
       services.prometheus = {
         enable = true;
         port = 9001;
+        alertmanagers = [
+          {
+            scheme = "http";
+            path_prefix = "/";
+            static_configs = [
+              {
+                targets = [
+                  "127.0.0.1:9093"
+                ];
+              }
+            ];
+          }
+        ];
+        ruleFiles = [
+          "${pkgs.writeText
+            "alert_rule.yml"
+            ''
+              groups:
+              - name: blackbox_alert
+                rules:
+                - alert: EndpointDown
+                  expr: probe_success{job="blackbox"} == 0
+                  for: 1m
+                  labels:
+                    severity: critical
+                  annotations:
+                    summary: "Endpoint {{ $labels.instance }} down"
+                    description: "An endpoint has been down for more than 1 minute."
+            ''}"
+        ];
         scrapeConfigs = [
           {
             job_name = "node_exporter";
@@ -171,51 +217,74 @@ in {
         ];
       };
 
-      services.grafana = {
+      # alerting: Alertmanager (port 9093) posts to the xmpp-alerts webhook (port 9199), which sends the message from notifybotJid to receiverJid
+      services.prometheus.xmpp-alerts = {
         enable = true;
-        settings.server = {
-          http_port = 2342;
-          http_addr = "0.0.0.0";
+        settings = {
+          jid = notifybotJid;
+          password_command = "cat ${config.sops.secrets."comms/xmpp/notifybot/password".path}";
+          to_jid = receiverJid;
+          listen_address = "0.0.0.0";
+          listen_port = 9199;
         };
       };
+      services.prometheus.alertmanager = {
+        webExternalUrl = "http://${containerIp}:9093"; # must be a full http(s) URL, not a bare IP
+        enable = true;
+        openFirewall = true;
+        port = 9093;
+        configText = ''
+          global:
+            resolve_timeout: 1m
 
-      services.prometheus = {
-        exporters = {
-          blackbox = {
-            enable = true;
-            configFile = pkgs.writeText "blackbox-conf.yaml" ''
-              modules:
-                http_basic:
-                  prober: http
-                  timeout: 5s
-                  http:
-                    preferred_ip_protocol: ip4
-                    valid_http_versions: ["HTTP/1.1", "HTTP/2"]
-                    method: GET
-                    # fail_if_ssl: false
-                    # fail_if_not_ssl: true
-                    # tls_config:
-                    #   insecure_skip_verify: true
-                tcp_connect:
-                  prober: tcp
-                  tcp:
-                    preferred_ip_protocol: ip4
+          route:
+            group_by: ['...']
+            repeat_interval: 1h
+            receiver: 'xmpp-alerts'
 
-            '';
-          };
-          node = {
-            enable = true;
-            enabledCollectors = ["systemd"];
-            port = 9002;
-          };
-          restic = {
-            enable = true;
-            repository = "";
-            environmentFile = config.sops.secrets."software/restic-exporter-credentials".path;
-            passwordFile = config.sops.secrets."software/restic-passphrase".path;
-            refreshInterval = 10800; # refresh every 3 hours
-            port = 8001;
-          };
+          receivers:
+            - name: 'xmpp-alerts'
+              webhook_configs:
+                - url: 'http://127.0.0.1:9199/alert'
+        '';
+      };
+
+      # prometheus exporters (blackbox probes, node metrics, restic repository stats)
+      services.prometheus.exporters = {
+        blackbox = {
+          enable = true;
+          configFile = pkgs.writeText "blackbox-conf.yaml" ''
+            modules:
+              http_basic:
+                prober: http
+                timeout: 5s
+                http:
+                  preferred_ip_protocol: ip4
+                  valid_http_versions: ["HTTP/1.1", "HTTP/2"]
+                  method: GET
+                  # fail_if_ssl: false
+                  # fail_if_not_ssl: true
+                  # tls_config:
+                  #   insecure_skip_verify: true
+              tcp_connect:
+                prober: tcp
+                tcp:
+                  preferred_ip_protocol: ip4
+
+          '';
+        };
+        node = {
+          enable = true;
+          enabledCollectors = ["systemd"];
+          port = 9002;
+        };
+        restic = {
+          enable = true;
+          repository = "";
+          environmentFile = config.sops.secrets."software/restic-exporter-credentials".path;
+          passwordFile = config.sops.secrets."software/restic-passphrase".path;
+          refreshInterval = 10800; # refresh every 3 hours
+          port = 8001;
         };
       };
 
diff --git a/vars/default.nix b/vars/default.nix
index f6973cc..82b6969 100644
--- a/vars/default.nix
+++ b/vars/default.nix
@@ -4,6 +4,7 @@
     networking
     email
     metrics-server
+    xmpp
     ;
   locations = {
     mediaDataMountPoint = "/media/media";