4 Commits

Author SHA1 Message Date
bb39587292 Fix unifi service taking 5+ minutes to shut down
Some checks failed
Check Flake / check-flake (push) Failing after 4m8s
UniFi's Java process crashes during shutdown (Spring context race
condition) leaving mongod orphaned in the cgroup. The upstream module
sets KillSignal=SIGCONT so systemd won't interrupt the graceful
shutdown, but with the default KillMode=control-group this means
mongod also only gets SIGCONT (a no-op) and sits there until the
5-minute timeout triggers SIGKILL.

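Switch to KillMode=mixed so the main Java process still gets the
harmless SIGCONT, while anything left behind in the cgroup (mongod) is
killed once the main process is gone instead of idling until the
5-minute timeout. Note that per systemd.kill(5), mixed mode sends the
leftover processes the final kill signal (SIGKILL by default), not
SIGTERM, so this shortens the stop but is not a graceful mongod
shutdown.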
2026-03-03 22:02:21 -08:00
712b52a48d Capture full systemd unit name for ntfy error alerts 2026-03-03 21:46:45 -08:00
c6eeea982e Add ignoredUnits option; skip logrotate failures on s0 because they are spurious 2026-03-03 21:46:19 -08:00
6bd1b4466e Update claude.md 2026-03-03 21:43:36 -08:00
5 changed files with 24 additions and 15 deletions

View File

@@ -85,17 +85,3 @@ When adding or removing a web-facing service, update both:
- Always use `--no-link` when running `nix build` - Always use `--no-link` when running `nix build`
- Don't use `nix build --dry-run` unless you only need evaluation — it skips the actual build - Don't use `nix build --dry-run` unless you only need evaluation — it skips the actual build
- Avoid `2>&1` on nix commands — it can cause error output to be missed - Avoid `2>&1` on nix commands — it can cause error output to be missed
## Git Worktrees
When the user asks you to "start a worktree" or work in a worktree, **do not create one manually** with `git worktree add`. Instead, tell the user to start a new session with:
```bash
claude --worktree <name>
```
This is the built-in Claude Code worktree workflow. It creates the worktree at `.claude/worktrees/<name>/` with a branch `worktree-<name>` and starts a new Claude session inside it. Cleanup is handled automatically on exit.
When instructed to work in a git worktree (e.g., via `isolation: "worktree"` on a subagent), you **MUST** do so. If you are unable to create or use a git worktree, you **MUST** stop work immediately and report the failure to the user. Do not fall back to working in the main working tree.
When applying work from a git worktree back to the main branch, commit in the worktree first, then use `git cherry-pick` from the main working tree to bring the commit over. Do not use `git checkout` or `git apply` to copy files directly. Do **not** automatically apply worktree work to the main branch — always ask the user for approval first.

View File

@@ -19,6 +19,12 @@
default = ""; default = "";
description = "Extra arguments to pass to curl (e.g. --proxy http://host:port)."; description = "Extra arguments to pass to curl (e.g. --proxy http://host:port).";
}; };
ignoredUnits = lib.mkOption {
type = lib.types.listOf lib.types.str;
default = [ ];
description = "Unit names to skip failure notifications for.";
};
}; };
config = lib.mkIf config.thisMachine.hasRole."ntfy" { config = lib.mkIf config.thisMachine.hasRole."ntfy" {

View File

@@ -14,6 +14,12 @@ in
EnvironmentFile = "/run/agenix/ntfy-token"; EnvironmentFile = "/run/agenix/ntfy-token";
ExecStart = "${pkgs.writeShellScript "ntfy-failure-notify" '' ExecStart = "${pkgs.writeShellScript "ntfy-failure-notify" ''
unit="$1" unit="$1"
ignored_units=(${lib.concatMapStringsSep " " (u: lib.escapeShellArg u) cfg.ignoredUnits})
for ignored in "''${ignored_units[@]}"; do
if [[ "$unit" == "$ignored" ]]; then
exit 0
fi
done
logfile=$(mktemp) logfile=$(mktemp)
trap 'rm -f "$logfile"' EXIT trap 'rm -f "$logfile"' EXIT
${pkgs.systemd}/bin/journalctl -u "$unit" -n 50 --no-pager -o short > "$logfile" 2>/dev/null \ ${pkgs.systemd}/bin/journalctl -u "$unit" -n 50 --no-pager -o short > "$logfile" 2>/dev/null \
@@ -40,7 +46,7 @@ in
mkdir -p $out/lib/systemd/system/service.d mkdir -p $out/lib/systemd/system/service.d
cat > $out/lib/systemd/system/service.d/ntfy-on-failure.conf <<'EOF' cat > $out/lib/systemd/system/service.d/ntfy-on-failure.conf <<'EOF'
[Unit] [Unit]
OnFailure=ntfy-failure@%p.service OnFailure=ntfy-failure@%N.service
EOF EOF
'') '')
]; ];

View File

@@ -13,6 +13,15 @@ in
services.unifi.unifiPackage = pkgs.unifi; services.unifi.unifiPackage = pkgs.unifi;
services.unifi.mongodbPackage = pkgs.mongodb-7_0; services.unifi.mongodbPackage = pkgs.mongodb-7_0;
# The upstream module sets KillSignal=SIGCONT so systemd doesn't interfere
# with UniFi's self-managed shutdown. But UniFi's Java process crashes during
# shutdown (Spring context already closed) leaving mongod orphaned in the
# cgroup. With the default KillMode=control-group, mongod only gets SIGCONT
# (a no-op) and runs until the 5min timeout triggers SIGKILL.
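# KillMode=mixed sends SIGCONT to the main process only; whatever is left in
# the cgroup afterwards (mongod) then gets the final kill signal (SIGKILL by
# default, per systemd.kill(5)) without waiting for the 5min stop timeout.
# NOTE(review): that is not a SIGTERM, so mongod is not shut down gracefully.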
systemd.services.unifi.serviceConfig.KillMode = "mixed";
networking.firewall = lib.mkIf cfg.openMinimalFirewall { networking.firewall = lib.mkIf cfg.openMinimalFirewall {
allowedUDPPorts = [ allowedUDPPorts = [
3478 # STUN 3478 # STUN

View File

@@ -9,6 +9,8 @@
networking.hostName = "s0"; networking.hostName = "s0";
ntfy-alerts.ignoredUnits = [ "logrotate" ];
# system.autoUpgrade.enable = true; # system.autoUpgrade.enable = true;
nix.gc.automatic = lib.mkForce false; # allow the nix store to serve as a build cache nix.gc.automatic = lib.mkForce false; # allow the nix store to serve as a build cache