Add DDR5 DIMM temperature monitoring with ntfy alerts
Monitors spd5118 sensors every 5 minutes and sends an ntfy notification if any DIMM reaches or exceeds 55°C. Opt-in via ntfy-alerts.dimmTempCheck.enable; enabled on s0.
This commit is contained in:
@@ -5,6 +5,7 @@
|
||||
./service-failure.nix
|
||||
./ssh-login.nix
|
||||
./zfs.nix
|
||||
./dimm-temp.nix
|
||||
];
|
||||
|
||||
options.ntfy-alerts = {
|
||||
|
||||
69
common/ntfy/dimm-temp.nix
Normal file
69
common/ntfy/dimm-temp.nix
Normal file
@@ -0,0 +1,69 @@
|
||||
{ config, lib, pkgs, ... }:
|
||||
|
||||
let
|
||||
cfg = config.ntfy-alerts;
|
||||
hasNtfy = config.thisMachine.hasRole."ntfy";
|
||||
|
||||
# Shell script executed by the dimm-temp-check service.
#
# Reads the raw output of `sensors -u` for every spd5118 chip, collects each
# chip whose temp1_input is at or above the threshold, and posts one ntfy
# notification listing them. Exits non-zero when no spd5118 chip could be read
# or when the notification could not be delivered, so the unit's result
# reflects whether monitoring actually worked.
#
# Expects NTFY_TOKEN in the environment (the service loads it via
# EnvironmentFile).
checkScript = pkgs.writeShellScript "dimm-temp-check" ''
  # sensors and curl are the only external commands; everything else used
  # below (read, case, [, echo) is a bash builtin.
  PATH="${lib.makeBinPath [ pkgs.lm_sensors pkgs.curl ]}"

  threshold=55
  hot=""
  chip="unknown"
  chips=0

  while IFS= read -r line; do
    case "$line" in
      spd5118-*)
        # Chip header line: remember it so readings can be attributed.
        chip="$line"
        chips=$((chips + 1))
        ;;
      *temp1_input:*)
        temp="''${line##*: }"
        # Integer part only; [ -ge ] cannot compare decimals.
        whole="''${temp%%.*}"
        # Skip empty or non-numeric readings instead of letting the
        # integer comparison print an error.
        case "$whole" in
          ""|*[!0-9-]*) continue ;;
        esac
        if [ "$whole" -ge "$threshold" ]; then
          hot="$hot"$'\n'" $chip: ''${temp}°C"
        fi
        ;;
    esac
  done < <(sensors -u 'spd5118-*' 2>/dev/null)

  # sensors' own stderr is discarded above, so a missing driver or chip would
  # otherwise look exactly like "all DIMMs are cool".
  if [ "$chips" -eq 0 ]; then
    echo "dimm-temp-check: no spd5118 sensors found; DIMM temperatures are not being monitored" >&2
    exit 1
  fi

  if [ -n "$hot" ]; then
    message="DIMM temperature at or above ''${threshold}°C on ${config.networking.hostName}:$hot"

    # Log first so the journal has the readings even if delivery fails.
    echo "$message" >&2

    if ! curl \
      --fail --silent --show-error \
      --max-time 30 --retry 3 \
      -H "Authorization: Bearer $NTFY_TOKEN" \
      -H "Title: High DIMM temperature on ${config.networking.hostName}" \
      -H "Priority: high" \
      -H "Tags: thermometer" \
      -d "$message" \
      "${cfg.serverUrl}/service-failures"
    then
      echo "dimm-temp-check: failed to deliver ntfy notification" >&2
      exit 1
    fi
  fi
'';
|
||||
in
|
||||
{
  # Opt-in switch; nothing below is configured unless this is set.
  options.ntfy-alerts.dimmTempCheck.enable = lib.mkEnableOption "DDR5 DIMM temperature monitoring via spd5118";

  # Active only when the check is enabled AND this machine has the "ntfy" role.
  config = lib.mkIf (cfg.dimmTempCheck.enable && hasNtfy) {
    systemd = {
      # One-shot unit that runs the check script once per activation.
      services.dimm-temp-check = {
        description = "Check DDR5 DIMM temperatures and alert on overheating";

        # The script posts to the ntfy server, so order it after the network is up.
        after = [ "network-online.target" ];
        wants = [ "network-online.target" ];

        serviceConfig.Type = "oneshot";
        # Presumably supplies NTFY_TOKEN, which the script sends as a bearer token.
        serviceConfig.EnvironmentFile = "/run/agenix/ntfy-token";
        serviceConfig.ExecStart = checkScript;
      };

      # Timer that activates the service above on a calendar schedule.
      timers.dimm-temp-check = {
        description = "Periodic DDR5 DIMM temperature check";
        wantedBy = [ "timers.target" ];

        # Every fifth minute of every hour.
        timerConfig.OnCalendar = "*:0/5";
        timerConfig.Persistent = true;
      };
    };
  };
}
|
||||
@@ -10,6 +10,7 @@
|
||||
networking.hostName = "s0";
|
||||
|
||||
ntfy-alerts.ignoredUnits = [ "logrotate" ];
|
||||
ntfy-alerts.dimmTempCheck.enable = true;
|
||||
|
||||
# system.autoUpgrade.enable = true;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user