Add daily ZFS health check with ntfy alerts and introduce ntfy role

Add a zfs-alerts module that runs a daily health check on ZFS machines, sending detailed ntfy notifications for degraded pools, data errors, or drive errors. Introduce an "ntfy" system role to decouple ntfy alerting from the server/personal roles, and assign it to all machines.
2026-02-22 17:17:40 -08:00
parent 200d5a5d22
commit a697ea10ad
8 changed files with 94 additions and 1 deletions
@@ -7,6 +7,7 @@
    ./flakes.nix
    ./auto-update.nix
    ./ntfy-alerts.nix
+    ./zfs-alerts.nix
    ./shell.nix
    ./network
    ./boot
@@ -18,7 +18,7 @@ in
    };
  };

-  config = lib.mkIf (config.thisMachine.hasRole."server" || config.thisMachine.hasRole."personal") {
+  config = lib.mkIf config.thisMachine.hasRole."ntfy" {
    age.secrets.ntfy-token.file = ../secrets/ntfy-token.age;

    systemd.services."ntfy-failure@" = {
@@ -0,0 +1,87 @@
+{ config, lib, pkgs, ... }:
+
+let
+  cfg = config.ntfy-alerts;
+  hasZfs = config.boot.supportedFilesystems.zfs or false;
+  hasNtfy = config.thisMachine.hasRole."ntfy";
+
+  checkScript = pkgs.writeShellScript "zfs-health-check" ''
+    PATH="${lib.makeBinPath [ pkgs.zfs pkgs.coreutils pkgs.gawk pkgs.curl ]}"
+
+    unhealthy=""
+
+    # Check pool health status
+    while IFS=$'\t' read -r pool state; do
+      if [ "$state" != "ONLINE" ]; then
+        unhealthy="$unhealthy"$'\n'"Pool '$pool' is $state"
+      fi
+    done < <(zpool list -H -o name,health)
+
+    # Check for errors (read, write, checksum) on any vdev
+    while IFS=$'\t' read -r pool errors; do
+      if [ "$errors" != "No known data errors" ] && [ -n "$errors" ]; then
+        unhealthy="$unhealthy"$'\n'"Pool '$pool' has errors: $errors"
+      fi
+    done < <(zpool status -x 2>/dev/null | awk '
+      /pool:/ { pool=$2 }
+      /errors:/ { sub(/^[[:space:]]*errors: /, ""); print pool "\t" $0 }
+    ')
+
+    # Check for any drives with non-zero error counts
+    drive_errors=$(zpool status 2>/dev/null | awk '
+      /DEGRADED|FAULTED|OFFLINE|UNAVAIL|REMOVED/ && !/pool:/ && !/state:/ {
+        print "  " $0
+      }
+      /[0-9]+[[:space:]]+[0-9]+[[:space:]]+[0-9]+/ {
+        if ($3 > 0 || $4 > 0 || $5 > 0) {
+          print "  " $1 " (read:" $3 " write:" $4 " cksum:" $5 ")"
+        }
+      }
+    ')
+    if [ -n "$drive_errors" ]; then
+      unhealthy="$unhealthy"$'\n'"Device errors:"$'\n'"$drive_errors"
+    fi
+
+    if [ -n "$unhealthy" ]; then
+      message="ZFS health check failed on ${config.networking.hostName}:$unhealthy"
+
+      curl \
+        --fail --silent --show-error \
+        --max-time 30 --retry 3 \
+        -H "Authorization: Bearer $NTFY_TOKEN" \
+        -H "Title: ZFS issue on ${config.networking.hostName}" \
+        -H "Priority: urgent" \
+        -H "Tags: warning" \
+        -d "$message" \
+        "${cfg.serverUrl}/${cfg.topic}"
+
+      echo "$message" >&2
+    fi
+
+    echo "All ZFS pools healthy"
+  '';
+in
+{
+  config = lib.mkIf (hasZfs && hasNtfy) {
+    systemd.services.zfs-health-check = {
+      description = "Check ZFS pool health and alert on issues";
+      wants = [ "network-online.target" ];
+      after = [ "network-online.target" "zfs.target" ];
+      serviceConfig = {
+        Type = "oneshot";
+        EnvironmentFile = "/run/agenix/ntfy-token";
+        ExecStart = checkScript;
+      };
+    };
+
+    systemd.timers.zfs-health-check = {
+      description = "Periodic ZFS health check";
+      wantedBy = [ "timers.target" ];
+      timerConfig = {
+        OnCalendar = "daily";
+        Persistent = true;
+        RandomizedDelaySec = "1h";
+      };
+    };
+  };
+}