9 Commits

Author SHA1 Message Date
gitea-runner
412dd12b5a flake.lock: update inputs
Some checks failed
Check Flake / check-flake (push) Successful in 2m22s
Auto Update Flake / auto-update (push) Failing after 2m40s
2026-02-22 22:01:06 -08:00
684851d641 Prevent containers from running non-container services
All checks were successful
Check Flake / check-flake (push) Successful in 2m21s
Auto Update Flake / auto-update (push) Successful in 3m29s
2026-02-22 18:18:05 -08:00
4cf50b5fb1 Restart atticd whenever PostgreSQL restarts
All checks were successful
Check Flake / check-flake (push) Successful in 3m7s
2026-02-22 17:53:46 -08:00
288a2841aa Replace Uptime Kuma with Gatus for declarative uptime monitoring
All checks were successful
Check Flake / check-flake (push) Successful in 2m4s
Gatus is configured entirely via YAML (mapped from Nix attrsets),
making nix-config the single source of truth for all monitoring
config instead of Uptime Kuma's web UI/SQLite database.
2026-02-22 17:30:03 -08:00
0589ca5748 Add attic binary cache to sandboxed workspaces
Update the attic cache URL from s0.koi-bebop.ts.net to s0.neet.dev
and configure sandboxed workspaces to inherit the host's binary cache
settings (substituters, trusted keys, netrc auth via agenix).
2026-02-22 17:22:44 -08:00
a4c5cb589a Claude workspaces 2026-02-22 17:19:48 -08:00
a697ea10ad Add daily ZFS health check with ntfy alerts and introduce ntfy role
Add a zfs-alerts module that runs a daily health check on ZFS machines,
sending detailed ntfy notifications for degraded pools, data errors, or
drive errors. Introduce an "ntfy" system role to decouple ntfy alerting
from the server/personal roles, and assign it to all machines.
2026-02-22 17:17:40 -08:00
200d5a5d22 Add ntfy failure alerts for all systemd services
All checks were successful
Check Flake / check-flake (push) Successful in 3m18s
2026-02-22 16:19:43 -08:00
339eac52c6 Add uptime kuma
All checks were successful
Check Flake / check-flake (push) Successful in 9m15s
2026-02-22 15:49:26 -08:00
20 changed files with 356 additions and 10 deletions

1
.gitignore vendored
View File

@@ -1 +1,2 @@
result
.claude/worktrees

View File

@@ -67,6 +67,12 @@ IP allocation convention: VMs `.10-.49`, containers `.50-.89`, incus `.90-.129`
`flake.nix` applies patches from `/patches/` to nixpkgs before building (workaround for nix#3920).
### Service Dashboard & Monitoring
When adding or removing a web-facing service, update both:
- **Gatus** (`common/server/gatus.nix`) — add/remove the endpoint monitor
- **Dashy** — add/remove the service entry from the dashboard config
### Key Conventions
- Uses `doas` instead of `sudo` everywhere
@@ -79,3 +85,9 @@ IP allocation convention: VMs `.10-.49`, containers `.50-.89`, incus `.90-.129`
- Always use `--no-link` when running `nix build`
- Don't use `nix build --dry-run` unless you only need evaluation — it skips the actual build
- Avoid `2>&1` on nix commands — it can cause error output to be missed
## Git Worktree Requirement
When instructed to work in a git worktree (e.g., via `isolation: "worktree"` or told to use a worktree), you **MUST** do so. If you are unable to create or use a git worktree, you **MUST** stop work immediately and report the failure to the user. Do not fall back to working in the main working tree.
When applying work from a git worktree back to the main branch, commit in the worktree first, then use `git cherry-pick` from the main working tree to bring the commit over. Do not use `git checkout` or `git apply` to copy files directly. Do **not** automatically apply worktree work to the main branch — always ask the user for approval first.

View File

@@ -6,7 +6,7 @@
substituters = [
"https://cache.nixos.org/"
"https://nix-community.cachix.org"
"http://s0.koi-bebop.ts.net:28338/nixos"
"http://s0.neet.dev:28338/nixos"
];
trusted-public-keys = [
"nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs="

View File

@@ -6,6 +6,8 @@
./binary-cache.nix
./flakes.nix
./auto-update.nix
./ntfy-alerts.nix
./zfs-alerts.nix
./shell.nix
./network
./boot
@@ -100,5 +102,5 @@
security.acme.defaults.email = "zuckerberg@neet.dev";
# Enable Desktop Environment if this is a PC (machine role is "personal")
de.enable = lib.mkDefault (config.thisMachine.hasRole."personal");
de.enable = lib.mkDefault (config.thisMachine.hasRole."personal" && !config.boot.isContainer);
}

View File

@@ -12,7 +12,7 @@ let
in
lib.mkMerge [
# configure builder
(lib.mkIf thisMachineIsABuilder {
(lib.mkIf (thisMachineIsABuilder && !config.boot.isContainer) {
users.users.${builderUserName} = {
description = "Distributed Nix Build User";
group = builderUserName;

57
common/ntfy-alerts.nix Normal file
View File

@@ -0,0 +1,57 @@
{ config, lib, pkgs, ... }:
let
cfg = config.ntfy-alerts;
in
{
options.ntfy-alerts = {
serverUrl = lib.mkOption {
type = lib.types.str;
default = "https://ntfy.neet.dev";
description = "Base URL of the ntfy server.";
};
topic = lib.mkOption {
type = lib.types.str;
default = "service-failures";
description = "ntfy topic to publish alerts to.";
};
};
config = lib.mkIf config.thisMachine.hasRole."ntfy" {
age.secrets.ntfy-token.file = ../secrets/ntfy-token.age;
systemd.services."ntfy-failure@" = {
description = "Send ntfy alert for failed unit %i";
wants = [ "network-online.target" ];
after = [ "network-online.target" ];
serviceConfig = {
Type = "oneshot";
EnvironmentFile = "/run/agenix/ntfy-token";
ExecStart = "${pkgs.writeShellScript "ntfy-failure-notify" ''
unit="$1"
${lib.getExe pkgs.curl} \
--fail --silent --show-error \
--max-time 30 --retry 3 \
-H "Authorization: Bearer $NTFY_TOKEN" \
-H "Title: Service failure on ${config.networking.hostName}" \
-H "Priority: high" \
-H "Tags: rotating_light" \
-d "Unit $unit failed at $(date +%c)" \
"${cfg.serverUrl}/${cfg.topic}"
''} %i";
};
};
# Apply OnFailure to all services via a systemd drop-in
systemd.packages = [
(pkgs.runCommand "ntfy-on-failure-dropin" { } ''
mkdir -p $out/lib/systemd/system/service.d
cat > $out/lib/systemd/system/service.d/ntfy-on-failure.conf <<'EOF'
[Unit]
OnFailure=ntfy-failure@%p.service
EOF
'')
];
};
}

View File

@@ -27,6 +27,7 @@ in
../shell.nix
hostConfig.inputs.home-manager.nixosModules.home-manager
hostConfig.inputs.nix-index-database.nixosModules.default
hostConfig.inputs.agenix.nixosModules.default
];
nixpkgs.overlays = [
@@ -116,6 +117,13 @@ in
nix.settings.experimental-features = [ "nix-command" "flakes" ];
nix.settings.trusted-users = [ "googlebot" ];
# Binary cache configuration (inherited from host's common/binary-cache.nix)
nix.settings.substituters = hostConfig.nix.settings.substituters;
nix.settings.trusted-public-keys = hostConfig.nix.settings.trusted-public-keys;
nix.settings.fallback = true;
nix.settings.netrc-file = config.age.secrets.attic-netrc.path;
age.secrets.attic-netrc.file = ../../secrets/attic-netrc.age;
# Make nixpkgs available in NIX_PATH and registry (like the NixOS ISO)
# This allows `nix-shell -p`, `nix repl '<nixpkgs>'`, etc. to work
nix.nixPath = [ "nixpkgs=${hostConfig.inputs.nixpkgs}" ];

View File

@@ -1,7 +1,7 @@
{ config, lib, ... }:
{
config = lib.mkIf (config.thisMachine.hasRole."binary-cache") {
config = lib.mkIf (config.thisMachine.hasRole."binary-cache" && !config.boot.isContainer) {
services.atticd = {
enable = true;
environmentFile = config.age.secrets.atticd-credentials.path;
@@ -49,6 +49,7 @@
systemd.services.atticd = {
after = [ "postgresql.service" ];
requires = [ "postgresql.service" ];
partOf = [ "postgresql.service" ];
serviceConfig = {
DynamicUser = lib.mkForce false;
User = "atticd";

View File

@@ -17,5 +17,6 @@
./actualbudget.nix
./unifi.nix
./ntfy.nix
./gatus.nix
];
}

146
common/server/gatus.nix Normal file
View File

@@ -0,0 +1,146 @@
{ lib, config, ... }:
let
cfg = config.services.gatus;
port = 31103;
in
{
options.services.gatus = {
hostname = lib.mkOption {
type = lib.types.str;
example = "status.example.com";
};
};
config = lib.mkIf cfg.enable {
services.gatus = {
environmentFile = "/run/agenix/ntfy-token";
settings = {
storage = {
type = "sqlite";
path = "/var/lib/gatus/data.db";
};
web = {
address = "127.0.0.1";
port = port;
};
alerting.ntfy = {
url = "https://ntfy.neet.dev";
topic = "service-failures";
priority = 4;
default-alert = {
enabled = true;
failure-threshold = 3;
success-threshold = 2;
send-on-resolved = true;
};
token = "$NTFY_TOKEN";
};
endpoints = [
{
name = "Gitea";
group = "services";
url = "https://git.neet.dev";
interval = "5m";
conditions = [
"[STATUS] == 200"
];
alerts = [{ type = "ntfy"; }];
}
{
name = "The Lounge";
group = "services";
url = "https://irc.neet.dev";
interval = "5m";
conditions = [
"[STATUS] == 200"
];
alerts = [{ type = "ntfy"; }];
}
{
name = "ntfy";
group = "services";
url = "https://ntfy.neet.dev/v1/health";
interval = "5m";
conditions = [
"[STATUS] == 200"
];
alerts = [{ type = "ntfy"; }];
}
{
name = "Librechat";
group = "services";
url = "https://chat.neet.dev";
interval = "5m";
conditions = [
"[STATUS] == 200"
];
alerts = [{ type = "ntfy"; }];
}
{
name = "Owncast";
group = "services";
url = "https://live.neet.dev";
interval = "5m";
conditions = [
"[STATUS] == 200"
];
alerts = [{ type = "ntfy"; }];
}
{
name = "Nextcloud";
group = "services";
url = "https://neet.cloud";
interval = "5m";
conditions = [
"[STATUS] == any(200, 302)"
];
alerts = [{ type = "ntfy"; }];
}
{
name = "Element Web";
group = "services";
url = "https://chat.neet.space";
interval = "5m";
conditions = [
"[STATUS] == 200"
];
alerts = [{ type = "ntfy"; }];
}
{
name = "Mumble";
group = "services";
url = "tcp://voice.neet.space:23563";
interval = "5m";
conditions = [
"[CONNECTED] == true"
];
alerts = [{ type = "ntfy"; }];
}
{
name = "Navidrome";
group = "services";
url = "https://navidrome.neet.cloud";
interval = "5m";
conditions = [
"[STATUS] == 200"
];
alerts = [{ type = "ntfy"; }];
}
];
};
};
services.nginx.enable = true;
services.nginx.virtualHosts.${cfg.hostname} = {
enableACME = true;
forceSSL = true;
locations."/" = {
proxyPass = "http://127.0.0.1:${toString port}";
proxyWebsockets = true;
};
};
};
}

87
common/zfs-alerts.nix Normal file
View File

@@ -0,0 +1,87 @@
{ config, lib, pkgs, ... }:
let
cfg = config.ntfy-alerts;
hasZfs = config.boot.supportedFilesystems.zfs or false;
hasNtfy = config.thisMachine.hasRole."ntfy";
checkScript = pkgs.writeShellScript "zfs-health-check" ''
PATH="${lib.makeBinPath [ pkgs.zfs pkgs.coreutils pkgs.gawk pkgs.curl ]}"
unhealthy=""
# Check pool health status
while IFS=$'\t' read -r pool state; do
if [ "$state" != "ONLINE" ]; then
unhealthy="$unhealthy"$'\n'"Pool '$pool' is $state"
fi
done < <(zpool list -H -o name,health)
# Check for errors (read, write, checksum) on any vdev
while IFS=$'\t' read -r pool errors; do
if [ "$errors" != "No known data errors" ] && [ -n "$errors" ]; then
unhealthy="$unhealthy"$'\n'"Pool '$pool' has errors: $errors"
fi
done < <(zpool status -x 2>/dev/null | awk '
/pool:/ { pool=$2 }
/errors:/ { sub(/^[[:space:]]*errors: /, ""); print pool "\t" $0 }
')
# Check for any drives with non-zero error counts
drive_errors=$(zpool status 2>/dev/null | awk '
/DEGRADED|FAULTED|OFFLINE|UNAVAIL|REMOVED/ && !/pool:/ && !/state:/ {
print " " $0
}
/[0-9]+[[:space:]]+[0-9]+[[:space:]]+[0-9]+/ {
if ($3 > 0 || $4 > 0 || $5 > 0) {
print " " $1 " (read:" $3 " write:" $4 " cksum:" $5 ")"
}
}
')
if [ -n "$drive_errors" ]; then
unhealthy="$unhealthy"$'\n'"Device errors:"$'\n'"$drive_errors"
fi
if [ -n "$unhealthy" ]; then
message="ZFS health check failed on ${config.networking.hostName}:$unhealthy"
curl \
--fail --silent --show-error \
--max-time 30 --retry 3 \
-H "Authorization: Bearer $NTFY_TOKEN" \
-H "Title: ZFS issue on ${config.networking.hostName}" \
-H "Priority: urgent" \
-H "Tags: warning" \
-d "$message" \
"${cfg.serverUrl}/${cfg.topic}"
echo "$message" >&2
fi
echo "All ZFS pools healthy"
'';
in
{
config = lib.mkIf (hasZfs && hasNtfy) {
systemd.services.zfs-health-check = {
description = "Check ZFS pool health and alert on issues";
wants = [ "network-online.target" ];
after = [ "network-online.target" "zfs.target" ];
serviceConfig = {
Type = "oneshot";
EnvironmentFile = "/run/agenix/ntfy-token";
ExecStart = checkScript;
};
};
systemd.timers.zfs-health-check = {
description = "Periodic ZFS health check";
wantedBy = [ "timers.target" ];
timerConfig = {
OnCalendar = "daily";
Persistent = true;
RandomizedDelaySec = "1h";
};
};
};
}

6
flake.lock generated
View File

@@ -250,11 +250,11 @@
"spectrum": "spectrum"
},
"locked": {
"lastModified": 1771712688,
"narHash": "sha256-Pf4CaRoOLQV02m2POPA+0EWvb3gVdpaiS0hNNVZhO3c=",
"lastModified": 1771802632,
"narHash": "sha256-UAH8YfrHRvXAMeFxUzJ4h4B1loz1K1wiNUNI8KiPqOg=",
"owner": "astro",
"repo": "microvm.nix",
"rev": "a3abc020a3d8e624e145f4144ed40702f788ea32",
"rev": "b67e3d80df3ec35bdfd3a00ad64ee437ef4fcded",
"type": "github"
},
"original": {

View File

@@ -8,6 +8,7 @@
systemRoles = [
"personal"
"dns-challenge"
"ntfy"
];
hostKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAID/Df5lG07Il7fizEgZR/T9bMlR0joESRJ7cqM9BkOyP";

View File

@@ -7,6 +7,7 @@
systemRoles = [
"personal"
"ntfy"
];
hostKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIEQi3q8jU6vRruExAL60J7GFO1gS8HsmXVJuKRT4ljrG";

View File

@@ -112,4 +112,8 @@
# push notifications
services.ntfy-sh.enable = true;
services.ntfy-sh.hostname = "ntfy.neet.dev";
# uptime monitoring
services.gatus.enable = true;
services.gatus.hostname = "status.neet.dev";
}

View File

@@ -15,6 +15,7 @@
"dailybot"
"gitea"
"librechat"
"ntfy"
];
hostKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIMBBlTAIp38RhErU1wNNV5MBeb+WGH0mhF/dxh5RsAXN";

View File

@@ -18,6 +18,7 @@
"linkwarden"
"outline"
"dns-challenge"
"ntfy"
];
hostKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIAwiXcUFtAvZCayhu4+AIcF+Ktrdgv9ee/mXSIhJbp4q";

View File

@@ -8,6 +8,7 @@
systemRoles = [
"personal"
"media-center"
"ntfy"
];
hostKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHvdC1EiLqSNVmk5L1p7cWRIrrlelbK+NMj6tEBrwqIq";

19
secrets/ntfy-token.age Normal file
View File

@@ -0,0 +1,19 @@
age-encryption.org/v1
-> ssh-ed25519 qEbiMg 5JtpNApPNiFqAB/gQcAsE1gz0Fg/uHW92f6Kx1J2ggQ
RzC1MQxyDYW1IuMo+OtSgcsND4v7XIRn0rCSkKCFA3A
-> ssh-ed25519 N7drjg mn6LWo+2zWEtUavbFQar966+j+g5su+lcBfWYz1aZDQ
EmKpdfkCSQao1+O/HJdOiam7UvBnDYcEEkgH6KrudQI
-> ssh-ed25519 jQaHAA an3Ukqz3BVoz0FEAA6/Lw1XKOkQWHwmTut+XD4E4vS8
9N2ePtXG2FPJSmOwcAO9p92MJKJJpTlEhKSmgMiinB0
-> ssh-ed25519 ZDy34A v98HzmBgwgOpUk2WrRuFsCdNR+nF2veVLzyT2pU2ZXY
o2pO5JbVEeaOFQ3beBvej6qgDdT9mgPCVHxmw2umhA0
-> ssh-ed25519 w3nu8g Uba2LWQueJ50Ds1/RjvkXI+VH7calMiM7dbL02sRJ3U
mFj5skmDXhJV9lK5iwUpebqxqVPAexdUntrbWJEix+Q
-> ssh-ed25519 evqvfg wLEEdTdmDRiFGYDrYFQjvRzc3zmGXgvztIy3UuFXnWg
CtCV/PpaxBmtDV+6InmcbKxNDmbPUTzyCm8tCf1Qw/4
-> ssh-ed25519 6AT2/g NaA1AhOdb+FGOCoWFEX0QN4cXS1CxlpFwpsH1L6vBA0
Z9aMYAofQ4Ath0zsg0YdZG3GTnCN2uQW7EG02bMZRsc
-> ssh-ed25519 hPp1nw +shAZrydcbjXfYxm1UW1YosKg5ZwBBKO6cct4HdotBo
GxnopKlmZQ/I6kMZPNurLgqwwkFHpUradaNYTPnlMFU
--- W6DrhCmU08IEIyPpHDiRV21xVeALNk1bDHrLYc2YcC4
'+Æ©<C386>CËl”i.Z£t;ýL²1Eãk#5ŸŠÑ£38‰?± šFWhö6?±_Ã=Âæ<C382>ãj/æÌRFÆáãåiÿóãÌZ{

View File

@@ -9,7 +9,7 @@ let
nobody = sshKeys.userKeys;
# For secrets that all machines need to know
everyone = roles.personal ++ roles.server;
everyone = lib.unique (roles.personal ++ roles.server);
in
with roles;
@@ -43,8 +43,11 @@ with roles;
"linkwarden-environment.age".publicKeys = linkwarden;
# backups
"backblaze-s3-backups.age".publicKeys = personal ++ server;
"restic-password.age".publicKeys = personal ++ server;
"backblaze-s3-backups.age".publicKeys = everyone;
"restic-password.age".publicKeys = everyone;
# ntfy alerts
"ntfy-token.age".publicKeys = everyone;
# gitea actions runner
"gitea-actions-runner-token.age".publicKeys = gitea-actions-runner;