diff --git a/common/default.nix b/common/default.nix
index 33a30f7..f72be2b 100644
--- a/common/default.nix
+++ b/common/default.nix
@@ -7,6 +7,7 @@
     ./flakes.nix
     ./auto-update.nix
     ./ntfy-alerts.nix
+    ./zfs-alerts.nix
     ./shell.nix
     ./network
     ./boot
diff --git a/common/ntfy-alerts.nix b/common/ntfy-alerts.nix
index e5a3206..aa92897 100644
--- a/common/ntfy-alerts.nix
+++ b/common/ntfy-alerts.nix
@@ -18,7 +18,7 @@ in
     };
   };
 
-  config = lib.mkIf (config.thisMachine.hasRole."server" || config.thisMachine.hasRole."personal") {
+  config = lib.mkIf config.thisMachine.hasRole."ntfy" {
     age.secrets.ntfy-token.file = ../secrets/ntfy-token.age;
 
     systemd.services."ntfy-failure@" = {
diff --git a/common/zfs-alerts.nix b/common/zfs-alerts.nix
new file mode 100644
index 0000000..c000975
--- /dev/null
+++ b/common/zfs-alerts.nix
@@ -0,0 +1,100 @@
+# Periodic ZFS health check: sends an urgent ntfy notification when a pool is
+# not ONLINE, reports data errors, or has devices with non-zero error counters.
+# Enabled only on machines that support ZFS and carry the "ntfy" role.
+{ config, lib, pkgs, ... }:
+
+let
+  cfg = config.ntfy-alerts;
+  hasZfs = config.boot.supportedFilesystems.zfs or false;
+  hasNtfy = config.thisMachine.hasRole."ntfy";
+
+  checkScript = pkgs.writeShellScript "zfs-health-check" ''
+    PATH="${lib.makeBinPath [ pkgs.zfs pkgs.coreutils pkgs.gawk pkgs.curl ]}"
+
+    unhealthy=""
+
+    # Check pool health status
+    while IFS=$'\t' read -r pool state; do
+      if [ "$state" != "ONLINE" ]; then
+        unhealthy="$unhealthy"$'\n'"Pool '$pool' is $state"
+      fi
+    done < <(zpool list -H -o name,health)
+
+    # Check the "errors:" summary line of each pool listed by zpool status -x
+    while IFS=$'\t' read -r pool errors; do
+      if [ "$errors" != "No known data errors" ] && [ -n "$errors" ]; then
+        unhealthy="$unhealthy"$'\n'"Pool '$pool' has errors: $errors"
+      fi
+    done < <(zpool status -x 2>/dev/null | awk '
+      /pool:/ { pool=$2 }
+      /errors:/ { sub(/^[[:space:]]*errors: /, ""); print pool "\t" $0 }
+    ')
+
+    # Check for any drives with non-zero error counts. -p prints exact
+    # counters; abbreviated values such as 1.2K would not match the numeric
+    # pattern below.
+    drive_errors=$(zpool status -p 2>/dev/null | awk '
+      /DEGRADED|FAULTED|OFFLINE|UNAVAIL|REMOVED/ && !/pool:/ && !/state:/ {
+        print " " $0
+        next  # full line already shown; do not report the device twice
+      }
+      /[0-9]+[[:space:]]+[0-9]+[[:space:]]+[0-9]+/ {
+        if ($3 > 0 || $4 > 0 || $5 > 0) {
+          print " " $1 " (read:" $3 " write:" $4 " cksum:" $5 ")"
+        }
+      }
+    ')
+    if [ -n "$drive_errors" ]; then
+      unhealthy="$unhealthy"$'\n'"Device errors:"$'\n'"$drive_errors"
+    fi
+
+    if [ -z "$unhealthy" ]; then
+      echo "All ZFS pools healthy"
+      exit 0
+    fi
+
+    message="ZFS health check failed on ${config.networking.hostName}:$unhealthy"
+
+    # Log first so the journal has the details even if delivery fails.
+    echo "$message" >&2
+
+    curl \
+      --fail --silent --show-error \
+      --max-time 30 --retry 3 \
+      -H "Authorization: Bearer $NTFY_TOKEN" \
+      -H "Title: ZFS issue on ${config.networking.hostName}" \
+      -H "Priority: urgent" \
+      -H "Tags: warning" \
+      -d "$message" \
+      "${cfg.serverUrl}/${cfg.topic}" \
+      || echo "Failed to deliver ZFS alert via ntfy" >&2
+
+    # Exit non-zero so systemd marks the run as failed instead of healthy.
+    exit 1
+  '';
+in
+{
+  config = lib.mkIf (hasZfs && hasNtfy) {
+    systemd.services.zfs-health-check = {
+      description = "Check ZFS pool health and alert on issues";
+      wants = [ "network-online.target" ];
+      after = [ "network-online.target" "zfs.target" ];
+      serviceConfig = {
+        Type = "oneshot";
+        # Take the path from agenix rather than hard-coding /run/agenix.
+        EnvironmentFile = config.age.secrets.ntfy-token.path;
+        ExecStart = checkScript;
+      };
+    };
+
+    systemd.timers.zfs-health-check = {
+      description = "Periodic ZFS health check";
+      wantedBy = [ "timers.target" ];
+      timerConfig = {
+        OnCalendar = "daily";
+        Persistent = true;
+        RandomizedDelaySec = "1h";
+      };
+    };
+  };
+}
diff --git a/machines/fry/properties.nix b/machines/fry/properties.nix
index c5eef13..e8a76dd 100644
--- a/machines/fry/properties.nix
+++ b/machines/fry/properties.nix
@@ -8,6 +8,7 @@
   systemRoles = [
     "personal"
     "dns-challenge"
+    "ntfy"
   ];
 
   hostKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAID/Df5lG07Il7fizEgZR/T9bMlR0joESRJ7cqM9BkOyP";
diff --git a/machines/howl/properties.nix b/machines/howl/properties.nix
index 5c7f178..cbf8a27 100644
--- a/machines/howl/properties.nix
+++ b/machines/howl/properties.nix
@@ -7,6 +7,7 @@
 
   systemRoles = [
     "personal"
+    "ntfy"
   ];
 
   hostKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIEQi3q8jU6vRruExAL60J7GFO1gS8HsmXVJuKRT4ljrG";
diff --git a/machines/ponyo/properties.nix b/machines/ponyo/properties.nix
index 7662160..c1b8403 100644
--- a/machines/ponyo/properties.nix
+++ b/machines/ponyo/properties.nix
@@ -15,6 +15,7 @@
     "dailybot"
     "gitea"
     "librechat"
+    "ntfy"
   ];
 
   hostKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIMBBlTAIp38RhErU1wNNV5MBeb+WGH0mhF/dxh5RsAXN";
diff --git a/machines/storage/s0/properties.nix b/machines/storage/s0/properties.nix
index dc857da..1aaa34a 100644
--- a/machines/storage/s0/properties.nix
+++ b/machines/storage/s0/properties.nix
@@ -18,6 +18,7 @@
     "linkwarden"
     "outline"
     "dns-challenge"
+    "ntfy"
   ];
 
   hostKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIAwiXcUFtAvZCayhu4+AIcF+Ktrdgv9ee/mXSIhJbp4q";
diff --git a/machines/zoidberg/properties.nix b/machines/zoidberg/properties.nix
index 923ad9f..5d55595 100644
--- a/machines/zoidberg/properties.nix
+++ b/machines/zoidberg/properties.nix
@@ -8,6 +8,7 @@
   systemRoles = [
     "personal"
     "media-center"
+    "ntfy"
   ];
 
   hostKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHvdC1EiLqSNVmk5L1p7cWRIrrlelbK+NMj6tEBrwqIq";