Add daily ZFS health check with ntfy alerts and introduce ntfy role
Add a zfs-alerts module that runs a daily health check on ZFS machines, sending detailed ntfy notifications for degraded pools, data errors, or drive errors. Introduce an "ntfy" system role to decouple ntfy alerting from the server/personal roles, and assign it to all machines.
This commit is contained in:
@@ -7,6 +7,7 @@
|
|||||||
./flakes.nix
|
./flakes.nix
|
||||||
./auto-update.nix
|
./auto-update.nix
|
||||||
./ntfy-alerts.nix
|
./ntfy-alerts.nix
|
||||||
|
./zfs-alerts.nix
|
||||||
./shell.nix
|
./shell.nix
|
||||||
./network
|
./network
|
||||||
./boot
|
./boot
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ in
|
|||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
config = lib.mkIf (config.thisMachine.hasRole."server" || config.thisMachine.hasRole."personal") {
|
config = lib.mkIf config.thisMachine.hasRole."ntfy" {
|
||||||
age.secrets.ntfy-token.file = ../secrets/ntfy-token.age;
|
age.secrets.ntfy-token.file = ../secrets/ntfy-token.age;
|
||||||
|
|
||||||
systemd.services."ntfy-failure@" = {
|
systemd.services."ntfy-failure@" = {
|
||||||
|
|||||||
87
common/zfs-alerts.nix
Normal file
87
common/zfs-alerts.nix
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
# Daily ZFS health check with ntfy alerting.
#
# Enabled only on machines that both list ZFS in
# `boot.supportedFilesystems` and carry the "ntfy" system role. A oneshot
# service inspects every imported pool and, when something looks wrong,
# publishes an urgent notification to `<serverUrl>/<topic>` taken from
# `config.ntfy-alerts`. A systemd timer runs the service once a day.
{ config, lib, pkgs, ... }:

let
  # Server URL and topic for notifications.
  # NOTE(review): presumably declared by ntfy-alerts.nix; confirm the option names.
  cfg = config.ntfy-alerts;

  # `or false` keeps evaluation working when the `zfs` key is absent.
  hasZfs = config.boot.supportedFilesystems.zfs or false;
  hasNtfy = config.thisMachine.hasRole."ntfy";

  # Shell script run by the service. It collects findings in `unhealthy`
  # (one finding per line) and sends a single notification with all of them.
  #
  # Exit status:
  #   0  all pools healthy, or problems were found and the alert was delivered
  #   1  problems were found but the alert could not be delivered
  checkScript = pkgs.writeShellScript "zfs-health-check" ''
    # Use the ZFS userland selected for this system so that the tools match
    # the kernel module actually loaded.
    PATH="${lib.makeBinPath [ config.boot.zfs.package pkgs.coreutils pkgs.gawk pkgs.curl ]}"

    unhealthy=""

    # Check pool health status: anything other than ONLINE is reported.
    while IFS=$'\t' read -r pool state; do
      if [ "$state" != "ONLINE" ]; then
        unhealthy="$unhealthy"$'\n'"Pool '$pool' is $state"
      fi
    done < <(zpool list -H -o name,health)

    # Check the per-pool "errors:" summary. `zpool status -x` only prints
    # pools that have problems, so healthy systems yield no lines here.
    while IFS=$'\t' read -r pool errors; do
      if [ "$errors" != "No known data errors" ] && [ -n "$errors" ]; then
        unhealthy="$unhealthy"$'\n'"Pool '$pool' has errors: $errors"
      fi
    done < <(zpool status -x 2>/dev/null | awk '
      /pool:/ { pool=$2 }
      /errors:/ { sub(/^[[:space:]]*errors: /, ""); print pool "\t" $0 }
    ')

    # Check for any devices in a bad state or with non-zero error counters.
    # Device rows are "NAME STATE READ WRITE CKSUM", hence fields 3 to 5.
    drive_errors=$(zpool status 2>/dev/null | awk '
      /DEGRADED|FAULTED|OFFLINE|UNAVAIL|REMOVED/ && !/pool:/ && !/state:/ {
        print "  " $0
      }
      /[0-9]+[[:space:]]+[0-9]+[[:space:]]+[0-9]+/ {
        if ($3 > 0 || $4 > 0 || $5 > 0) {
          print "  " $1 " (read:" $3 " write:" $4 " cksum:" $5 ")"
        }
      }
    ')
    if [ -n "$drive_errors" ]; then
      unhealthy="$unhealthy"$'\n'"Device errors:"$'\n'"$drive_errors"
    fi

    # Nothing found: report success and stop, so the healthy message is
    # never printed alongside an alert.
    if [ -z "$unhealthy" ]; then
      echo "All ZFS pools healthy"
      exit 0
    fi

    message="ZFS health check failed on ${config.networking.hostName}:$unhealthy"

    # Log the findings first so they reach the journal even if delivery fails.
    echo "$message" >&2

    # NTFY_TOKEN is provided through the service's EnvironmentFile.
    if ! curl \
      --fail --silent --show-error \
      --max-time 30 --retry 3 \
      -H "Authorization: Bearer $NTFY_TOKEN" \
      -H "Title: ZFS issue on ${config.networking.hostName}" \
      -H "Priority: urgent" \
      -H "Tags: warning" \
      -d "$message" \
      "${cfg.serverUrl}/${cfg.topic}"
    then
      # Surface undelivered alerts as a unit failure instead of hiding them.
      echo "Failed to deliver ZFS alert to ntfy" >&2
      exit 1
    fi
  '';
in
{
  config = lib.mkIf (hasZfs && hasNtfy) {
    # Oneshot unit that performs the check; ordered after networking and ZFS.
    systemd.services.zfs-health-check = {
      description = "Check ZFS pool health and alert on issues";
      wants = [ "network-online.target" ];
      after = [ "network-online.target" "zfs.target" ];
      serviceConfig = {
        Type = "oneshot";
        # Supplies NTFY_TOKEN to the script.
        EnvironmentFile = "/run/agenix/ntfy-token";
        ExecStart = checkScript;
      };
    };

    # Daily trigger. Persistent catches up on runs missed while powered off;
    # the randomized delay spreads the start time over up to one hour.
    systemd.timers.zfs-health-check = {
      description = "Periodic ZFS health check";
      wantedBy = [ "timers.target" ];
      timerConfig = {
        OnCalendar = "daily";
        Persistent = true;
        RandomizedDelaySec = "1h";
      };
    };
  };
}
|
||||||
@@ -8,6 +8,7 @@
|
|||||||
systemRoles = [
|
systemRoles = [
|
||||||
"personal"
|
"personal"
|
||||||
"dns-challenge"
|
"dns-challenge"
|
||||||
|
"ntfy"
|
||||||
];
|
];
|
||||||
|
|
||||||
hostKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAID/Df5lG07Il7fizEgZR/T9bMlR0joESRJ7cqM9BkOyP";
|
hostKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAID/Df5lG07Il7fizEgZR/T9bMlR0joESRJ7cqM9BkOyP";
|
||||||
|
|||||||
@@ -7,6 +7,7 @@
|
|||||||
|
|
||||||
systemRoles = [
|
systemRoles = [
|
||||||
"personal"
|
"personal"
|
||||||
|
"ntfy"
|
||||||
];
|
];
|
||||||
|
|
||||||
hostKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIEQi3q8jU6vRruExAL60J7GFO1gS8HsmXVJuKRT4ljrG";
|
hostKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIEQi3q8jU6vRruExAL60J7GFO1gS8HsmXVJuKRT4ljrG";
|
||||||
|
|||||||
@@ -15,6 +15,7 @@
|
|||||||
"dailybot"
|
"dailybot"
|
||||||
"gitea"
|
"gitea"
|
||||||
"librechat"
|
"librechat"
|
||||||
|
"ntfy"
|
||||||
];
|
];
|
||||||
|
|
||||||
hostKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIMBBlTAIp38RhErU1wNNV5MBeb+WGH0mhF/dxh5RsAXN";
|
hostKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIMBBlTAIp38RhErU1wNNV5MBeb+WGH0mhF/dxh5RsAXN";
|
||||||
|
|||||||
@@ -18,6 +18,7 @@
|
|||||||
"linkwarden"
|
"linkwarden"
|
||||||
"outline"
|
"outline"
|
||||||
"dns-challenge"
|
"dns-challenge"
|
||||||
|
"ntfy"
|
||||||
];
|
];
|
||||||
|
|
||||||
hostKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIAwiXcUFtAvZCayhu4+AIcF+Ktrdgv9ee/mXSIhJbp4q";
|
hostKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIAwiXcUFtAvZCayhu4+AIcF+Ktrdgv9ee/mXSIhJbp4q";
|
||||||
|
|||||||
@@ -8,6 +8,7 @@
|
|||||||
systemRoles = [
|
systemRoles = [
|
||||||
"personal"
|
"personal"
|
||||||
"media-center"
|
"media-center"
|
||||||
|
"ntfy"
|
||||||
];
|
];
|
||||||
|
|
||||||
hostKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHvdC1EiLqSNVmk5L1p7cWRIrrlelbK+NMj6tEBrwqIq";
|
hostKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHvdC1EiLqSNVmk5L1p7cWRIrrlelbK+NMj6tEBrwqIq";
|
||||||
|
|||||||
Reference in New Issue
Block a user