From 335abe4e65455f01a38477a66309fcce92356aaf Mon Sep 17 00:00:00 2001
From: Zuckerberg
Date: Wed, 4 Mar 2026 21:24:40 -0800
Subject: [PATCH] Add DDR5 DIMM temperature monitoring with ntfy alerts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Monitors spd5118 sensors every 5 minutes and sends an ntfy notification
if any DIMM exceeds 55°C. Opt-in via ntfy-alerts.dimmTempCheck.enable,
enabled on s0.

The check logs the alert before posting it, so a failed ntfy delivery
fails the unit instead of being hidden by a trailing echo.
---
Review note: dimm-temp.nix was edited after this patch was generated, so
the blob id on its "index" line may be stale.

 common/ntfy/default.nix         |  1 +
 common/ntfy/dimm-temp.nix       | 78 +++++++++++++++++++++++++++++++++
 machines/storage/s0/default.nix |  1 +
 3 files changed, 80 insertions(+)
 create mode 100644 common/ntfy/dimm-temp.nix

diff --git a/common/ntfy/default.nix b/common/ntfy/default.nix
index e5c8ac1..d917040 100644
--- a/common/ntfy/default.nix
+++ b/common/ntfy/default.nix
@@ -5,6 +5,7 @@
     ./service-failure.nix
     ./ssh-login.nix
     ./zfs.nix
+    ./dimm-temp.nix
   ];
 
   options.ntfy-alerts = {
diff --git a/common/ntfy/dimm-temp.nix b/common/ntfy/dimm-temp.nix
new file mode 100644
index 0000000..dd61d11
--- /dev/null
+++ b/common/ntfy/dimm-temp.nix
@@ -0,0 +1,78 @@
+{ config, lib, pkgs, ... }:
+
+# Opt-in DDR5 DIMM temperature check: a oneshot service, run by a timer every
+# five minutes, that posts an ntfy alert when any spd5118 sensor is too hot.
+let
+  cfg = config.ntfy-alerts;
+  hasNtfy = config.thisMachine.hasRole."ntfy";
+
+  checkScript = pkgs.writeShellScript "dimm-temp-check" ''
+    PATH="${lib.makeBinPath [ pkgs.lm_sensors pkgs.coreutils pkgs.curl ]}"
+
+    threshold=55
+    hot=""
+
+    # Lines starting with spd5118- name the chip; the temp1_input lines that
+    # follow carry its reading, so remember the chip for each reading.
+    while IFS= read -r line; do
+      case "$line" in
+        spd5118-*)
+          chip="$line"
+          ;;
+        *temp1_input:*)
+          temp="''${line##*: }"
+          # Compare the integer part only: [ -ge ] cannot handle decimals.
+          whole="''${temp%%.*}"
+          if [ "$whole" -ge "$threshold" ]; then
+            hot="$hot"$'\n'" $chip: ''${temp}°C"
+          fi
+          ;;
+      esac
+    done < <(sensors -u 'spd5118-*' 2>/dev/null)
+
+    if [ -n "$hot" ]; then
+      message="DIMM temperature above ''${threshold}°C on ${config.networking.hostName}:$hot"
+
+      # Log first so that curl is the last command: its exit status becomes
+      # the unit result, and a failed delivery is no longer masked by echo.
+      echo "$message" >&2
+
+      curl \
+        --fail --silent --show-error \
+        --max-time 30 --retry 3 \
+        -H "Authorization: Bearer $NTFY_TOKEN" \
+        -H "Title: High DIMM temperature on ${config.networking.hostName}" \
+        -H "Priority: high" \
+        -H "Tags: thermometer" \
+        -d "$message" \
+        "${cfg.serverUrl}/service-failures"
+    fi
+  '';
+in
+{
+  options.ntfy-alerts.dimmTempCheck.enable = lib.mkEnableOption "DDR5 DIMM temperature monitoring via spd5118";
+
+  config = lib.mkIf (cfg.dimmTempCheck.enable && hasNtfy) {
+    systemd.services.dimm-temp-check = {
+      description = "Check DDR5 DIMM temperatures and alert on overheating";
+      wants = [ "network-online.target" ];
+      after = [ "network-online.target" ];
+      serviceConfig = {
+        Type = "oneshot";
+        # Expected to define NTFY_TOKEN, which the check script reads.
+        EnvironmentFile = "/run/agenix/ntfy-token";
+        ExecStart = checkScript;
+      };
+    };
+
+    systemd.timers.dimm-temp-check = {
+      description = "Periodic DDR5 DIMM temperature check";
+      wantedBy = [ "timers.target" ];
+      timerConfig = {
+        # Every five minutes, on the minute.
+        OnCalendar = "*:0/5";
+        Persistent = true;
+      };
+    };
+  };
+}
diff --git a/machines/storage/s0/default.nix b/machines/storage/s0/default.nix
index 8583e15..b6a620f 100644
--- a/machines/storage/s0/default.nix
+++ b/machines/storage/s0/default.nix
@@ -10,6 +10,7 @@
   networking.hostName = "s0";
 
   ntfy-alerts.ignoredUnits = [ "logrotate" ];
+  ntfy-alerts.dimmTempCheck.enable = true;
 
   # system.autoUpgrade.enable = true;
 