diff --git a/flake.lock b/flake.lock index 550380e..70e52a9 100644 --- a/flake.lock +++ b/flake.lock @@ -539,11 +539,11 @@ }, "nix-secrets": { "locked": { - "lastModified": 1737899664, - "narHash": "sha256-iZpzTSERNQ5UvFfEzrBLuEmcRUGjBSal7ShtXurYq8Q=", + "lastModified": 1738685297, + "narHash": "sha256-JOv3+toYlftzBm47QF5tzaBhTbQIm1IBq1tKeQrQLyM=", "ref": "refs/heads/master", - "rev": "a9844a78dcbdc8a84679835112970d80822b113c", - "revCount": 257, + "rev": "3be1d509f9823292dd9ca6b396743fbf722bd8b9", + "revCount": 269, "type": "git", "url": "ssh://git@git.bitlab21.com/sam/nix-secrets.git" }, diff --git a/hosts/common/optional/nixos-containers/metrics-server.nix b/hosts/common/optional/nixos-containers/metrics-server.nix index 50417f4..a880109 100644 --- a/hosts/common/optional/nixos-containers/metrics-server.nix +++ b/hosts/common/optional/nixos-containers/metrics-server.nix @@ -2,11 +2,15 @@ pkgs, lib, configVars, + inputs, ... }: let containerName = "metrics-server"; containerIp = configVars.networking.addresses.metrics-server.ip; + notifybotJid = configVars.xmpp.notifybotJid; + receiverJid = configVars.xmpp.personalAccount; + dockerContainerIp = configVars.networking.addresses.docker.ip; smWorkerIp = configVars.networking.addresses.sm-worker.ip; merlinIp = configVars.networking.addresses.merlin.ip; @@ -15,6 +19,7 @@ bitcoinNode = configVars.networking.addresses.bitcoin-node.ip; postres = configVars.networking.addresses.postgres.ip; backupServer = configVars.networking.addresses.backup-server.ip; + sops-nix = inputs.sops-nix; http_endpoints = configVars.metrics-server.blackbox.http_endpoints; @@ -52,6 +57,10 @@ in { hostPath = metricsServerContainerData; isReadOnly = false; }; + "/etc/ssh/ssh_host_ed25519_key" = { + hostPath = "/etc/ssh/ssh_host_ed25519_key"; + isReadOnly = true; + }; }; config = { @@ -59,7 +68,10 @@ in { lib, config, ... - }: { + }: let + secretsDirectory = builtins.toString inputs.nix-secrets; + secretsFile = "${secretsDirectory}/secrets.yaml"; + in { networking = { defaultGateway = "${gatewayIp}"; interfaces.eth0.ipv4.addresses = [ @@ -74,14 +86,32 @@ in { config.services.prometheus.port config.services.grafana.port config.services.prometheus.exporters.blackbox.port + 9199 #xmpp listen port ]; }; useHostResolvConf = lib.mkForce false; }; + sops = { + defaultSopsFile = "${secretsFile}"; + validateSopsFiles = false; + + age = { + sshKeyPaths = ["/etc/ssh/ssh_host_ed25519_key"]; + }; + secrets = { + "software/restic-passphrase" = {}; + "software/restic-exporter-credentials" = {}; + "comms/xmpp/notifybot/password" = { + mode = "0644"; + }; + }; + }; + services.resolved.enable = true; imports = [ + sops-nix.nixosModules.sops ]; environment.systemPackages = [ @@ -89,9 +119,49 @@ in { pkgs.git ]; + services.grafana = { + enable = true; + settings.server = { + http_port = 2342; + http_addr = "0.0.0.0"; + }; + }; + + # main prometheus service services.prometheus = { enable = true; + webExternalUrl = "http://${containerIp}:9001"; port = 9001; + alertmanagers = [ + { + scheme = "http"; + path_prefix = "/"; + static_configs = [ + { + targets = [ + "0.0.0.0:9093" + ]; + } + ]; + } + ]; + ruleFiles = [ + "${pkgs.writeText + "alert_rule.yml" + '' + groups: + - name: blackbox_alert + rules: + - alert: EndpointDown + expr: probe_success{job="blackbox"} == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Endpoint {{ $labels.instance }} down" + description: "An endpoint has been down for more than 1 minute." + ''}" + ]; scrapeConfigs = [ { job_name = "node_exporter"; @@ -110,6 +180,16 @@ in { } ]; } + { + job_name = "restic-exporter"; + static_configs = [ + { + targets = [ + "0.0.0.0:8001" + ]; + } + ]; + } { job_name = "blackbox"; @@ -138,43 +218,74 @@ in { ]; }; - services.grafana = { + # setup alertmanager + services.prometheus.xmpp-alerts = { enable = true; - settings.server = { - http_port = 2342; - http_addr = "0.0.0.0"; + settings = { + jid = notifybotJid; + password_command = "cat ${config.sops.secrets."comms/xmpp/notifybot/password".path}"; + to_jid = receiverJid; + listen_address = "0.0.0.0"; + listen_port = 9199; }; }; + services.prometheus.alertmanager = { + webExternalUrl = "http://${containerIp}:9093"; + enable = true; + openFirewall = true; + port = 9093; + configText = '' + global: + resolve_timeout: 1m - services.prometheus = { - exporters = { - blackbox = { - enable = true; - configFile = pkgs.writeText "blackbox-conf.yaml" '' - modules: - http_basic: - prober: http - timeout: 5s - http: - preferred_ip_protocol: ip4 - valid_http_versions: ["HTTP/1.1", "HTTP/2"] - method: GET - # fail_if_ssl: false - # fail_if_not_ssl: true - # tls_config: - # insecure_skip_verify: true - tcp_connect: - prober: tcp - tcp: - preferred_ip_protocol: ip4 + route: + group_by: ['...'] + repeat_interval: 1h + receiver: 'xmpp-alerts' - ''; - }; - node = { - enable = true; - enabledCollectors = ["systemd"]; - port = 9002; - }; + receivers: + - name: 'xmpp-alerts' + webhook_configs: + - url: 'http://0.0.0.0:9199/alert' + ''; + }; + + # prometheus exporters + services.prometheus.exporters = { + blackbox = { + enable = true; + configFile = pkgs.writeText "blackbox-conf.yaml" '' + modules: + http_basic: + prober: http + timeout: 5s + http: + preferred_ip_protocol: ip4 + valid_http_versions: ["HTTP/1.1", "HTTP/2"] + method: GET + # fail_if_ssl: false + # fail_if_not_ssl: true + # tls_config: + # insecure_skip_verify: true + tcp_connect: + prober: tcp + tcp: + preferred_ip_protocol: ip4 + + ''; + }; + node = { + enable = true; + enabledCollectors = ["systemd"]; + port = 9002; + }; + restic = { + enable = true; + repository = ""; + environmentFile = config.sops.secrets."software/restic-exporter-credentials".path; + passwordFile = config.sops.secrets."software/restic-passphrase".path; + refreshInterval = 10800; # refresh every 3 hours + port = 8001; }; };