Add ntfy ssh login alerts. Include systemd service logs with service errors
All checks were successful
Check Flake / check-flake (push) Successful in 3m34s
All checks were successful
Check Flake / check-flake (push) Successful in 3m34s
This commit is contained in:
27
common/ntfy/default.nix
Normal file
27
common/ntfy/default.nix
Normal file
@@ -0,0 +1,27 @@
|
||||
{ config, lib, ... }:
|
||||
|
||||
{
|
||||
imports = [
|
||||
./service-failure.nix
|
||||
./ssh-login.nix
|
||||
./zfs.nix
|
||||
];
|
||||
|
||||
options.ntfy-alerts = {
|
||||
serverUrl = lib.mkOption {
|
||||
type = lib.types.str;
|
||||
default = "https://ntfy.neet.dev";
|
||||
description = "Base URL of the ntfy server.";
|
||||
};
|
||||
|
||||
curlExtraArgs = lib.mkOption {
|
||||
type = lib.types.str;
|
||||
default = "";
|
||||
description = "Extra arguments to pass to curl (e.g. --proxy http://host:port).";
|
||||
};
|
||||
};
|
||||
|
||||
config = lib.mkIf config.thisMachine.hasRole."ntfy" {
|
||||
age.secrets.ntfy-token.file = ../../secrets/ntfy-token.age;
|
||||
};
|
||||
}
|
||||
48
common/ntfy/service-failure.nix
Normal file
48
common/ntfy/service-failure.nix
Normal file
@@ -0,0 +1,48 @@
|
||||
{ config, lib, pkgs, ... }:
|
||||
|
||||
let
|
||||
cfg = config.ntfy-alerts;
|
||||
in
|
||||
{
|
||||
config = lib.mkIf config.thisMachine.hasRole."ntfy" {
|
||||
systemd.services."ntfy-failure@" = {
|
||||
description = "Send ntfy alert for failed unit %i";
|
||||
wants = [ "network-online.target" ];
|
||||
after = [ "network-online.target" ];
|
||||
serviceConfig = {
|
||||
Type = "oneshot";
|
||||
EnvironmentFile = "/run/agenix/ntfy-token";
|
||||
ExecStart = "${pkgs.writeShellScript "ntfy-failure-notify" ''
|
||||
unit="$1"
|
||||
logfile=$(mktemp)
|
||||
trap 'rm -f "$logfile"' EXIT
|
||||
${pkgs.systemd}/bin/journalctl -u "$unit" -n 50 --no-pager -o short > "$logfile" 2>/dev/null \
|
||||
|| echo "(no logs available)" > "$logfile"
|
||||
${lib.getExe pkgs.curl} \
|
||||
-T "$logfile" \
|
||||
--fail --silent --show-error \
|
||||
--max-time 30 --retry 3 \
|
||||
${cfg.curlExtraArgs} \
|
||||
-H "Authorization: Bearer $NTFY_TOKEN" \
|
||||
-H "Title: Service failure on ${config.networking.hostName}" \
|
||||
-H "Priority: high" \
|
||||
-H "Tags: rotating_light" \
|
||||
-H "Message: Unit $unit failed at $(date +%c)" \
|
||||
-H "Filename: $unit.log" \
|
||||
"${cfg.serverUrl}/service-failures"
|
||||
''} %i";
|
||||
};
|
||||
};
|
||||
|
||||
# Apply OnFailure to all services via a systemd drop-in
|
||||
systemd.packages = [
|
||||
(pkgs.runCommand "ntfy-on-failure-dropin" { } ''
|
||||
mkdir -p $out/lib/systemd/system/service.d
|
||||
cat > $out/lib/systemd/system/service.d/ntfy-on-failure.conf <<'EOF'
|
||||
[Unit]
|
||||
OnFailure=ntfy-failure@%p.service
|
||||
EOF
|
||||
'')
|
||||
];
|
||||
};
|
||||
}
|
||||
36
common/ntfy/ssh-login.nix
Normal file
36
common/ntfy/ssh-login.nix
Normal file
@@ -0,0 +1,36 @@
|
||||
{ config, lib, pkgs, ... }:
|
||||
|
||||
let
|
||||
cfg = config.ntfy-alerts;
|
||||
|
||||
notifyScript = pkgs.writeShellScript "ssh-login-notify" ''
|
||||
# Only notify on session open, not close
|
||||
[ "$PAM_TYPE" = "open_session" ] || exit 0
|
||||
|
||||
. /run/agenix/ntfy-token
|
||||
|
||||
# Send notification in background so login isn't delayed
|
||||
${lib.getExe pkgs.curl} \
|
||||
--fail --silent --show-error \
|
||||
--max-time 10 --retry 1 \
|
||||
${cfg.curlExtraArgs} \
|
||||
-H "Authorization: Bearer $NTFY_TOKEN" \
|
||||
-H "Title: SSH login on ${config.networking.hostName}" \
|
||||
-H "Tags: key" \
|
||||
-d "$PAM_USER from $PAM_RHOST at $(date +%c)" \
|
||||
"${cfg.serverUrl}/ssh-logins" &
|
||||
'';
|
||||
in
|
||||
{
|
||||
config = lib.mkIf config.thisMachine.hasRole."ntfy" {
|
||||
security.pam.services.sshd.rules.session.ntfy-login = {
|
||||
order = 99999;
|
||||
control = "optional";
|
||||
modulePath = "${pkgs.pam}/lib/security/pam_exec.so";
|
||||
args = [
|
||||
"quiet"
|
||||
(toString notifyScript)
|
||||
];
|
||||
};
|
||||
};
|
||||
}
|
||||
87
common/ntfy/zfs.nix
Normal file
87
common/ntfy/zfs.nix
Normal file
@@ -0,0 +1,87 @@
|
||||
{ config, lib, pkgs, ... }:
|
||||
|
||||
let
|
||||
cfg = config.ntfy-alerts;
|
||||
hasZfs = config.boot.supportedFilesystems.zfs or false;
|
||||
hasNtfy = config.thisMachine.hasRole."ntfy";
|
||||
|
||||
checkScript = pkgs.writeShellScript "zfs-health-check" ''
|
||||
PATH="${lib.makeBinPath [ pkgs.zfs pkgs.coreutils pkgs.gawk pkgs.curl ]}"
|
||||
|
||||
unhealthy=""
|
||||
|
||||
# Check pool health status
|
||||
while IFS=$'\t' read -r pool state; do
|
||||
if [ "$state" != "ONLINE" ]; then
|
||||
unhealthy="$unhealthy"$'\n'"Pool '$pool' is $state"
|
||||
fi
|
||||
done < <(zpool list -H -o name,health)
|
||||
|
||||
# Check for errors (read, write, checksum) on any vdev
|
||||
while IFS=$'\t' read -r pool errors; do
|
||||
if [ "$errors" != "No known data errors" ] && [ -n "$errors" ]; then
|
||||
unhealthy="$unhealthy"$'\n'"Pool '$pool' has errors: $errors"
|
||||
fi
|
||||
done < <(zpool status -x 2>/dev/null | awk '
|
||||
/pool:/ { pool=$2 }
|
||||
/errors:/ { sub(/^[[:space:]]*errors: /, ""); print pool "\t" $0 }
|
||||
')
|
||||
|
||||
# Check for any drives with non-zero error counts
|
||||
drive_errors=$(zpool status 2>/dev/null | awk '
|
||||
/DEGRADED|FAULTED|OFFLINE|UNAVAIL|REMOVED/ && !/pool:/ && !/state:/ {
|
||||
print " " $0
|
||||
}
|
||||
/[0-9]+[[:space:]]+[0-9]+[[:space:]]+[0-9]+/ {
|
||||
if ($3 > 0 || $4 > 0 || $5 > 0) {
|
||||
print " " $1 " (read:" $3 " write:" $4 " cksum:" $5 ")"
|
||||
}
|
||||
}
|
||||
')
|
||||
if [ -n "$drive_errors" ]; then
|
||||
unhealthy="$unhealthy"$'\n'"Device errors:"$'\n'"$drive_errors"
|
||||
fi
|
||||
|
||||
if [ -n "$unhealthy" ]; then
|
||||
message="ZFS health check failed on ${config.networking.hostName}:$unhealthy"
|
||||
|
||||
curl \
|
||||
--fail --silent --show-error \
|
||||
--max-time 30 --retry 3 \
|
||||
-H "Authorization: Bearer $NTFY_TOKEN" \
|
||||
-H "Title: ZFS issue on ${config.networking.hostName}" \
|
||||
-H "Priority: urgent" \
|
||||
-H "Tags: warning" \
|
||||
-d "$message" \
|
||||
"${cfg.serverUrl}/service-failures"
|
||||
|
||||
echo "$message" >&2
|
||||
fi
|
||||
|
||||
echo "All ZFS pools healthy"
|
||||
'';
|
||||
in
|
||||
{
|
||||
config = lib.mkIf (hasZfs && hasNtfy) {
|
||||
systemd.services.zfs-health-check = {
|
||||
description = "Check ZFS pool health and alert on issues";
|
||||
wants = [ "network-online.target" ];
|
||||
after = [ "network-online.target" "zfs.target" ];
|
||||
serviceConfig = {
|
||||
Type = "oneshot";
|
||||
EnvironmentFile = "/run/agenix/ntfy-token";
|
||||
ExecStart = checkScript;
|
||||
};
|
||||
};
|
||||
|
||||
systemd.timers.zfs-health-check = {
|
||||
description = "Periodic ZFS health check";
|
||||
wantedBy = [ "timers.target" ];
|
||||
timerConfig = {
|
||||
OnCalendar = "daily";
|
||||
Persistent = true;
|
||||
RandomizedDelaySec = "1h";
|
||||
};
|
||||
};
|
||||
};
|
||||
}
|
||||
Reference in New Issue
Block a user