Fix VPN check alert limiting to only count failures
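StartLimitBurst counts all starts (including successful ones), so the timer was getting blocked after ~15 min. Replace it with a JSON counter file that is reset on every successful check and at the start of each day, so that OnFailure alerts are triggered only for the first 3 failures per day (counted since the last success); further failures exit 0 to suppress the alert.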
This commit is contained in:
@@ -228,44 +228,73 @@ in
|
|||||||
};
|
};
|
||||||
|
|
||||||
# Periodic VPN connectivity check — fails if VPN or internet is down,
|
# Periodic VPN connectivity check — fails if VPN or internet is down,
|
||||||
# triggering ntfy alert via the OnFailure drop-in
|
# triggering ntfy alert via the OnFailure drop-in.
|
||||||
|
# Tracks failures with a counter file so only the first 3 failures per
|
||||||
|
# day trigger an alert (subsequent failures exit 0 to suppress noise).
|
||||||
systemd.services.pia-vpn-check = {
|
systemd.services.pia-vpn-check = {
|
||||||
description = "Check PIA VPN connectivity";
|
description = "Check PIA VPN connectivity";
|
||||||
after = [ "pia-vpn-setup.service" ];
|
after = [ "pia-vpn-setup.service" ];
|
||||||
requires = [ "pia-vpn-setup.service" ];
|
requires = [ "pia-vpn-setup.service" ];
|
||||||
|
|
||||||
path = with pkgs; [ wireguard-tools iputils coreutils gawk ];
|
path = with pkgs; [ wireguard-tools iputils coreutils gawk jq ];
|
||||||
|
|
||||||
unitConfig = {
|
|
||||||
StartLimitBurst = 3;
|
|
||||||
StartLimitIntervalSec = "1d";
|
|
||||||
};
|
|
||||||
|
|
||||||
serviceConfig.Type = "oneshot";
|
serviceConfig.Type = "oneshot";
|
||||||
|
|
||||||
script = ''
|
script = ''
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
# Check that WireGuard has a peer with a recent handshake (within 3 minutes)
|
COUNTER_FILE="/var/lib/pia-vpn/check-fail-count.json"
|
||||||
handshake=$(wg show ${cfg.interfaceName} latest-handshakes | awk '{print $2}')
|
MAX_ALERTS=3
|
||||||
if [ -z "$handshake" ] || [ "$handshake" -eq 0 ]; then
|
|
||||||
echo "No WireGuard handshake recorded" >&2
|
check_vpn() {
|
||||||
exit 1
|
# Check that WireGuard has a peer with a recent handshake (within 3 minutes)
|
||||||
fi
|
handshake=$(wg show ${cfg.interfaceName} latest-handshakes | awk '{print $2}')
|
||||||
now=$(date +%s)
|
if [ -z "$handshake" ] || [ "$handshake" -eq 0 ]; then
|
||||||
age=$((now - handshake))
|
echo "No WireGuard handshake recorded" >&2
|
||||||
if [ "$age" -gt 180 ]; then
|
return 1
|
||||||
echo "WireGuard handshake is stale (''${age}s ago)" >&2
|
fi
|
||||||
exit 1
|
now=$(date +%s)
|
||||||
|
age=$((now - handshake))
|
||||||
|
if [ "$age" -gt 180 ]; then
|
||||||
|
echo "WireGuard handshake is stale (''${age}s ago)" >&2
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Verify internet connectivity through VPN tunnel
|
||||||
|
if ! ping -c1 -W10 1.1.1.1 >/dev/null 2>&1; then
|
||||||
|
echo "Cannot reach internet through VPN" >&2
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "PIA VPN connectivity OK (handshake ''${age}s ago)"
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
if check_vpn; then
|
||||||
|
rm -f "$COUNTER_FILE"
|
||||||
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Verify internet connectivity through VPN tunnel
|
# Failed — read and update counter (reset if from a previous day)
|
||||||
if ! ping -c1 -W10 1.1.1.1 >/dev/null 2>&1; then
|
today=$(date +%Y-%m-%d)
|
||||||
echo "Cannot reach internet through VPN" >&2
|
count=0
|
||||||
exit 1
|
if [ -f "$COUNTER_FILE" ]; then
|
||||||
|
stored=$(jq -r '.date // ""' "$COUNTER_FILE")
|
||||||
|
if [ "$stored" = "$today" ]; then
|
||||||
|
count=$(jq -r '.count // 0' "$COUNTER_FILE")
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
count=$((count + 1))
|
||||||
|
jq -n --arg date "$today" --argjson count "$count" \
|
||||||
|
'{"date": $date, "count": $count}' > "$COUNTER_FILE"
|
||||||
|
|
||||||
echo "PIA VPN connectivity OK (handshake ''${age}s ago)"
|
if [ "$count" -le "$MAX_ALERTS" ]; then
|
||||||
|
echo "Failure $count/$MAX_ALERTS today — alerting" >&2
|
||||||
|
exit 1
|
||||||
|
else
|
||||||
|
echo "Failure $count today — suppressing alert (already sent $MAX_ALERTS)" >&2
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
'';
|
'';
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user