setup prometheus alertmanager
This commit is contained in:
parent
f7876d08f6
commit
996e51f56e
3 changed files with 114 additions and 44 deletions
8
flake.lock
generated
8
flake.lock
generated
|
@ -539,11 +539,11 @@
|
|||
},
|
||||
"nix-secrets": {
|
||||
"locked": {
|
||||
"lastModified": 1738358831,
|
||||
"narHash": "sha256-BFkqC7xQwGpA7mYYGDBkzw9iehWao+BkR5Bp/dFicWY=",
|
||||
"lastModified": 1738685297,
|
||||
"narHash": "sha256-JOv3+toYlftzBm47QF5tzaBhTbQIm1IBq1tKeQrQLyM=",
|
||||
"ref": "refs/heads/master",
|
||||
"rev": "e7311c8f523ad3ffe187efe63f6438140fa0cf45",
|
||||
"revCount": 268,
|
||||
"rev": "3be1d509f9823292dd9ca6b396743fbf722bd8b9",
|
||||
"revCount": 269,
|
||||
"type": "git",
|
||||
"url": "ssh://git@git.bitlab21.com/sam/nix-secrets.git"
|
||||
},
|
||||
|
|
|
@ -8,6 +8,9 @@
|
|||
containerName = "metrics-server";
|
||||
containerIp = configVars.networking.addresses.metrics-server.ip;
|
||||
|
||||
notifybotJid = configVars.xmpp.notifybotJid;
|
||||
receiverJid = configVars.xmpp.personalAccount;
|
||||
|
||||
dockerContainerIp = configVars.networking.addresses.docker.ip;
|
||||
smWorkerIp = configVars.networking.addresses.sm-worker.ip;
|
||||
merlinIp = configVars.networking.addresses.merlin.ip;
|
||||
|
@ -83,6 +86,7 @@ in {
|
|||
config.services.prometheus.port
|
||||
config.services.grafana.port
|
||||
config.services.prometheus.exporters.blackbox.port
|
||||
9199 #xmpp listen port
|
||||
];
|
||||
};
|
||||
useHostResolvConf = lib.mkForce false;
|
||||
|
@ -98,6 +102,9 @@ in {
|
|||
secrets = {
|
||||
"software/restic-passphrase" = {};
|
||||
"software/restic-exporter-credentials" = {};
|
||||
"comms/xmpp/notifybot/password" = {
|
||||
mode = "0644";
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
|
@ -112,9 +119,48 @@ in {
|
|||
pkgs.git
|
||||
];
|
||||
|
||||
services.grafana = {
|
||||
enable = true;
|
||||
settings.server = {
|
||||
http_port = 2342;
|
||||
http_addr = "0.0.0.0";
|
||||
};
|
||||
};
|
||||
|
||||
# main prometheus service
|
||||
services.prometheus = {
|
||||
enable = true;
|
||||
port = 9001;
|
||||
alertmanagers = [
|
||||
{
|
||||
scheme = "http";
|
||||
path_prefix = "/";
|
||||
static_configs = [
|
||||
{
|
||||
targets = [
|
||||
"0.0.0.0:9093"
|
||||
];
|
||||
}
|
||||
];
|
||||
}
|
||||
];
|
||||
ruleFiles = [
|
||||
"${pkgs.writeText
|
||||
"alert_rule.yml"
|
||||
''
|
||||
groups:
|
||||
- name: blackbox_alert
|
||||
rules:
|
||||
- alert: EndpointDown
|
||||
expr: probe_success{job="blackbox"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Endpoint {{ $labels.instance }} down"
|
||||
description: "An endpoint has been down for more than 1 minute."
|
||||
''}"
|
||||
];
|
||||
scrapeConfigs = [
|
||||
{
|
||||
job_name = "node_exporter";
|
||||
|
@ -171,51 +217,74 @@ in {
|
|||
];
|
||||
};
|
||||
|
||||
services.grafana = {
|
||||
# setup alertmanager
|
||||
services.prometheus.xmpp-alerts = {
|
||||
enable = true;
|
||||
settings.server = {
|
||||
http_port = 2342;
|
||||
http_addr = "0.0.0.0";
|
||||
settings = {
|
||||
jid = notifybotJid;
|
||||
password_command = "cat ${config.sops.secrets."comms/xmpp/notifybot/password".path}";
|
||||
to_jid = receiverJid;
|
||||
listen_address = "0.0.0.0";
|
||||
listen_port = 9199;
|
||||
};
|
||||
};
|
||||
services.prometheus.alertmanager = {
|
||||
webExternalUrl = containerIp;
|
||||
enable = true;
|
||||
openFirewall = true;
|
||||
port = 9093;
|
||||
configText = ''
|
||||
global:
|
||||
resolve_timeout: 1m
|
||||
|
||||
services.prometheus = {
|
||||
exporters = {
|
||||
blackbox = {
|
||||
enable = true;
|
||||
configFile = pkgs.writeText "blackbox-conf.yaml" ''
|
||||
modules:
|
||||
http_basic:
|
||||
prober: http
|
||||
timeout: 5s
|
||||
http:
|
||||
preferred_ip_protocol: ip4
|
||||
valid_http_versions: ["HTTP/1.1", "HTTP/2"]
|
||||
method: GET
|
||||
# fail_if_ssl: false
|
||||
# fail_if_not_ssl: true
|
||||
# tls_config:
|
||||
# insecure_skip_verify: true
|
||||
tcp_connect:
|
||||
prober: tcp
|
||||
tcp:
|
||||
preferred_ip_protocol: ip4
|
||||
route:
|
||||
group_by: ['...']
|
||||
repeat_interval: 1h
|
||||
receiver: 'xmpp-alerts'
|
||||
|
||||
'';
|
||||
};
|
||||
node = {
|
||||
enable = true;
|
||||
enabledCollectors = ["systemd"];
|
||||
port = 9002;
|
||||
};
|
||||
restic = {
|
||||
enable = true;
|
||||
repository = "";
|
||||
environmentFile = config.sops.secrets."software/restic-exporter-credentials".path;
|
||||
passwordFile = config.sops.secrets."software/restic-passphrase".path;
|
||||
refreshInterval = 10800; # refresh every 3 hours
|
||||
port = 8001;
|
||||
};
|
||||
receivers:
|
||||
- name: 'xmpp-alerts'
|
||||
webhook_configs:
|
||||
- url: 'http://0.0.0.0:9199/alert'
|
||||
'';
|
||||
};
|
||||
|
||||
# prometheus exporters
|
||||
services.prometheus.exporters = {
|
||||
blackbox = {
|
||||
enable = true;
|
||||
configFile = pkgs.writeText "blackbox-conf.yaml" ''
|
||||
modules:
|
||||
http_basic:
|
||||
prober: http
|
||||
timeout: 5s
|
||||
http:
|
||||
preferred_ip_protocol: ip4
|
||||
valid_http_versions: ["HTTP/1.1", "HTTP/2"]
|
||||
method: GET
|
||||
# fail_if_ssl: false
|
||||
# fail_if_not_ssl: true
|
||||
# tls_config:
|
||||
# insecure_skip_verify: true
|
||||
tcp_connect:
|
||||
prober: tcp
|
||||
tcp:
|
||||
preferred_ip_protocol: ip4
|
||||
|
||||
'';
|
||||
};
|
||||
node = {
|
||||
enable = true;
|
||||
enabledCollectors = ["systemd"];
|
||||
port = 9002;
|
||||
};
|
||||
restic = {
|
||||
enable = true;
|
||||
repository = "";
|
||||
environmentFile = config.sops.secrets."software/restic-exporter-credentials".path;
|
||||
passwordFile = config.sops.secrets."software/restic-passphrase".path;
|
||||
refreshInterval = 10800; # refresh every 3 hours
|
||||
port = 8001;
|
||||
};
|
||||
};
|
||||
|
||||
|
|
|
@ -4,6 +4,7 @@
|
|||
networking
|
||||
email
|
||||
metrics-server
|
||||
xmpp
|
||||
;
|
||||
locations = {
|
||||
mediaDataMountPoint = "/media/media";
|
||||
|
|
Loading…
Add table
Reference in a new issue