Add DDR5 DIMM temperature monitoring with ntfy alerts

Monitors spd5118 sensors every 5 minutes and sends an ntfy
notification if any DIMM reaches 55°C or higher. Opt-in via
ntfy-alerts.dimmTempCheck.enable; enabled on s0.
This commit is contained in:
2026-03-04 21:24:40 -08:00
parent 6267def09b
commit 335abe4e65
3 changed files with 71 additions and 0 deletions

View File

@@ -5,6 +5,7 @@
./service-failure.nix
./ssh-login.nix
./zfs.nix
./dimm-temp.nix
];
options.ntfy-alerts = {

69
common/ntfy/dimm-temp.nix Normal file
View File

@@ -0,0 +1,69 @@
# NixOS module: periodic DDR5 DIMM temperature check with ntfy alerting.
#
# Declares the option `ntfy-alerts.dimmTempCheck.enable`. When it is set and
# the machine has the "ntfy" role, a oneshot service plus a timer are
# installed that read the spd5118 sensors and post an ntfy notification when
# any DIMM is at or above the threshold.
{ config, lib, pkgs, ... }:
let
  cfg = config.ntfy-alerts;
  hasNtfy = config.thisMachine.hasRole."ntfy";

  # Script executed by the dimm-temp-check service.
  # Exit status: 0 when nothing is hot or the alert was delivered,
  # 1 when an alert was due but could not be delivered.
  checkScript = pkgs.writeShellScript "dimm-temp-check" ''
    PATH="${lib.makeBinPath [ pkgs.lm_sensors pkgs.coreutils pkgs.curl ]}"
    threshold=55
    hot=""
    # Fallback label in case a reading shows up before any chip header.
    chip="unknown"
    readings=0

    # The parser expects a chip header line (spd5118-...) followed by
    # "temp1_input: <value>" lines in the raw sensors output.
    while IFS= read -r line; do
      case "$line" in
        spd5118-*)
          chip="$line"
          ;;
        *temp1_input:*)
          readings=$((readings + 1))
          temp="''${line##*: }"
          # Compare on the integer part only; the shell test is integer-only.
          whole="''${temp%%.*}"
          if [ "$whole" -ge "$threshold" ]; then
            hot="$hot"$'\n'" $chip: ''${temp}°C"
          fi
          ;;
      esac
    done < <(sensors -u 'spd5118-*' 2>/dev/null)

    # The sensors stderr is discarded above, so make a dead monitor visible
    # in the journal instead of silently reporting "all good".
    if [ "$readings" -eq 0 ]; then
      echo "dimm-temp-check: no spd5118 temperature readings found; nothing was checked" >&2
    fi

    if [ -n "$hot" ]; then
      message="DIMM temperature above ''${threshold}°C on ${config.networking.hostName}:$hot"
      # Log before sending so the reading is in the journal even if the
      # notification cannot be delivered.
      echo "$message" >&2
      if ! curl \
        --fail --silent --show-error \
        --max-time 30 --retry 3 \
        -H "Authorization: Bearer $NTFY_TOKEN" \
        -H "Title: High DIMM temperature on ${config.networking.hostName}" \
        -H "Priority: high" \
        -H "Tags: thermometer" \
        -d "$message" \
        "${cfg.serverUrl}/service-failures"; then
        # Surface delivery failures as a failed unit rather than exiting 0.
        echo "dimm-temp-check: failed to deliver ntfy alert" >&2
        exit 1
      fi
    fi
  '';
in
{
  options.ntfy-alerts.dimmTempCheck.enable =
    lib.mkEnableOption "DDR5 DIMM temperature monitoring via spd5118";

  # Only active when explicitly enabled AND the machine has the "ntfy" role.
  config = lib.mkIf (cfg.dimmTempCheck.enable && hasNtfy) {
    systemd.services.dimm-temp-check = {
      description = "Check DDR5 DIMM temperatures and alert on overheating";
      # The alert is an HTTP request, so wait for the network to be up.
      wants = [ "network-online.target" ];
      after = [ "network-online.target" ];
      serviceConfig = {
        Type = "oneshot";
        # The script reads $NTFY_TOKEN; this file is expected to define it.
        EnvironmentFile = "/run/agenix/ntfy-token";
        ExecStart = checkScript;
      };
    };

    systemd.timers.dimm-temp-check = {
      description = "Periodic DDR5 DIMM temperature check";
      wantedBy = [ "timers.target" ];
      timerConfig = {
        # Every 5 minutes, on the 5-minute marks of each hour.
        OnCalendar = "*:0/5";
        Persistent = true;
      };
    };
  };
}

View File

@@ -10,6 +10,7 @@
networking.hostName = "s0";
ntfy-alerts.ignoredUnits = [ "logrotate" ];
ntfy-alerts.dimmTempCheck.enable = true;
# system.autoUpgrade.enable = true;