Files
nix-config/common/ntfy/dimm-temp.nix
Zuckerberg 1e7aa17d3d
All checks were successful
Check Flake / check-flake (push) Successful in 3m58s
Log DIMM temperatures on each check run
2026-03-05 21:31:29 -08:00

74 lines
2.0 KiB
Nix

{ config, lib, pkgs, ... }:
let
cfg = config.ntfy-alerts;
hasNtfy = config.thisMachine.hasRole."ntfy";
checkScript = pkgs.writeShellScript "dimm-temp-check" ''
PATH="${lib.makeBinPath [ pkgs.lm_sensors pkgs.gawk pkgs.coreutils pkgs.curl ]}"
threshold=55
hot=""
summary=""
while IFS= read -r line; do
case "$line" in
spd5118-*)
chip="$line"
;;
*temp1_input:*)
temp="''${line##*: }"
whole="''${temp%%.*}"
summary="''${summary:+$summary, }$chip: ''${temp}°C"
if [ "$whole" -ge "$threshold" ]; then
hot="$hot"$'\n'" $chip: ''${temp}°C"
fi
;;
esac
done < <(sensors -u 'spd5118-*' 2>/dev/null)
echo "$summary"
if [ -n "$hot" ]; then
message="DIMM temperature above ''${threshold}°C on ${config.networking.hostName}:$hot"
curl \
--fail --silent --show-error \
--max-time 30 --retry 3 \
-H "Authorization: Bearer $NTFY_TOKEN" \
-H "Title: High DIMM temperature on ${config.networking.hostName}" \
-H "Priority: high" \
-H "Tags: thermometer" \
-d "$message" \
"${cfg.serverUrl}/service-failures"
echo "$message" >&2
fi
'';
in
{
options.ntfy-alerts.dimmTempCheck.enable = lib.mkEnableOption "DDR5 DIMM temperature monitoring via spd5118";
config = lib.mkIf (cfg.dimmTempCheck.enable && hasNtfy) {
systemd.services.dimm-temp-check = {
description = "Check DDR5 DIMM temperatures and alert on overheating";
wants = [ "network-online.target" ];
after = [ "network-online.target" ];
serviceConfig = {
Type = "oneshot";
EnvironmentFile = "/run/agenix/ntfy-token";
ExecStart = checkScript;
};
};
systemd.timers.dimm-temp-check = {
description = "Periodic DDR5 DIMM temperature check";
wantedBy = [ "timers.target" ];
timerConfig = {
OnCalendar = "*:0/5";
Persistent = true;
};
};
};
}