added rigby recovery

This commit is contained in:
Bryan Ramos 2026-04-10 22:05:59 -04:00
parent 194bb9c381
commit 532c874c96
12 changed files with 474 additions and 0 deletions

6
external/README.md vendored Normal file
View file

@ -0,0 +1,6 @@
# External Automation
This directory contains automation for systems that are not managed as NixOS
hosts inside this repository.
- `rigby/`: Ubuntu-based AI rig recovery and service automation.

64
external/rigby/README.md vendored Normal file
View file

@ -0,0 +1,64 @@
# Rigby Recovery
This directory contains disaster-recovery automation for `rigby`, the Ubuntu
AI rig at `192.168.0.23`.
## Scope
This automation manages the host state after a manual Ubuntu install.
It is intended to restore the working state we validated for:
- AMD ROCm `7.2.1`
- `amdgpu-dkms`
- `amdgpu.cwsr_enable=0`
- pinned ComfyUI checkout
- `uv`-managed Python `3.13` venv
- ROCm PyTorch
- ComfyUI service layout
- output sharing over Samba
- required groups and permissions
## Manual Prerequisites
These are intentionally documented, not automated:
- Install Ubuntu `24.04.4`
- Update BIOS to the known-good version for the board
- Verify BIOS settings:
- `Above 4G Decoding = Enabled`
- `SVM = Enabled`
- UEFI boot
- sane PCIe slot configuration
- Ensure host SSH is reachable as `bryan`
- Ensure passwordless sudo works for `bryan`
- Ensure the initial DHCP lease is known so recovery can begin
## Recovery Flow
1. Install Ubuntu manually.
2. Clone this repository onto the operator machine.
3. From the repo root, run `just rigby-check <rigby-ip>`.
4. Run `just rigby-recover <rigby-ip>`.
5. Reboot `rigby`.
6. Validate:
- `rocminfo`
- `rocm-smi`
- ComfyUI startup
## Notes
- The AMD repo and package installs are automated here, but BIOS and physical
host setup remain manual.
- ComfyUI itself is deployed as an application under `/home/comfy/ComfyUI`.
- The `comfyui.service` unit is installed but left disabled so the service is
started on demand.
- Models, LoRAs, VAEs, outputs, and other AI assets are not restored by this
automation. `rigby` is the source of truth for that data, so disaster
recovery for models requires a separate backup strategy.
- The `just` entrypoints take the target address as an optional positional
argument (for example `just rigby-recover 192.168.0.50`), so recovery does not
depend on a fixed DHCP lease.
- Recovery installs the configured SSH key for `bryan`.
- Static IP configuration is applied at the end of the playbook via netplan.
The SSH session used for recovery may be interrupted once the new address is
applied, and subsequent access should use the final static IP.

9
external/rigby/ansible.cfg vendored Normal file
View file

@ -0,0 +1,9 @@
# Ansible settings for the rigby recovery automation.
# Relative paths are resolved from the directory containing this file.
[defaults]
inventory = inventory.ini
# NOTE(review): host key checking is off, presumably because a reinstalled
# host presents new SSH host keys — confirm this is acceptable on this network.
host_key_checking = False
# YAML-formatted task results via the built-in default callback. This replaces
# `stdout_callback = yaml`, which relies on the deprecated community.general
# `yaml` callback plugin.
stdout_callback = default
callback_result_format = yaml
retry_files_enabled = False
interpreter_python = auto_silent

[ssh_connection]
pipelining = True

2
external/rigby/inventory.ini vendored Normal file
View file

@ -0,0 +1,2 @@
# Static inventory for the AI rig; the recovery play targets the `ai_rig` group.
[ai_rig]
rigby ansible_host=192.168.0.23

[ai_rig:vars]
ansible_user=bryan

306
external/rigby/playbooks/recover.yml vendored Normal file
View file

@ -0,0 +1,306 @@
---
# Disaster-recovery play for rigby. It assumes Ubuntu is already installed and
# reachable over SSH, then restores the AMD driver/ROCm stack, the ComfyUI
# checkout and venv, the systemd unit, the Samba output share, and finally the
# static network configuration.
- name: Recover rigby AI rig
  hosts: ai_rig
  become: true

  vars:
    # Operator account and the public keys authorised for it.
    rigby_user: bryan
    rigby_recovery_ssh_keys:
      - ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQDl4895aB9P5p/lp8Hq5rHun4clvhyTSHFi3U2d6OOBoW5Fm+VcQnW/xbjmCBsXk5BdiowsBxQhwnzdfz/KJL7J5RobomUEaVRwb9UwT88eJveLp14BG8j2J3SjfyhrCX+4jkPx0bPQk1HGcuYY+tPEXf1q/ps88Dhu0CARBIzYQOTYY6b1qWzxpDoFZGHjKG8g5iY6FIu65yKKvvVy1f8IgZ3l3IpwBWVamxgkTcYY0QYSrmzo1n7TXxwrWbvenAqBsQ0cBPs+gVa3uIr+1TJl0Az5SElBVGu3LvUdlk58trtPUj6TQR3YUkg7Vjll7WHOdqhux5ZQNhjkOsHerf0Tw86e6cEzgeTuIbQHIb0LcsUunwKcuh2+au7RO599cvHn0+xZE5MZBxloDDaJ3JsiliM8kyPP/U3ERj03cWLW7BqbT+sfjAOl21RCzk0iQxk1wt/8VmtCr9Adv7IyrtaYvf/bwRP+g+9ldmzKGt8Mdb605uVzZ70H/LLm17f40Te+QHaex5by/6p6cuwEEZtgIg53Wpglu0rA6UxrBfQEHKl/Jt3FLeE0mnEyYkkR2MnHNtyWRIXtuqYZMAm2Ub1pFHH7jQV1gGiDVTw6a2eIwK21a/hXtRjFUpFd1nB1n+KNfJBE4zT3wm3Ud7mKw/6rWnoRyhYZvGXkFdp+iEs49Q== itme-brain@github/78120816

    # Static addressing written to netplan by the last tasks of the play.
    rigby_static_network_enabled: true
    rigby_interface: eno1
    rigby_static_ip: 192.168.0.23/24
    rigby_gateway: 192.168.0.1
    rigby_dns:
      - 192.168.0.1
      - 1.1.1.1

    # ComfyUI account, filesystem layout and pinned versions.
    comfy_user: comfy
    comfy_group: comfy
    comfy_home: /home/comfy
    comfy_root: /home/comfy/ComfyUI
    comfy_venv: /home/comfy/comfy-venv
    comfy_python_version: "3.13"
    comfy_port: 8188
    comfy_output_dir: /home/comfy/ComfyUI/output
    comfy_repo_url: https://github.com/comfy-org/ComfyUI
    comfy_repo_version: a1344238901efc5ea199d8094cb16fca36ceb28b
    comfy_manager_version: "4.1"
    comfy_torch_index_url: https://download.pytorch.org/whl/rocm7.2

    # Kernel command line and AMD installer package.
    grub_cmdline_linux_default: "amdgpu.cwsr_enable=0"
    amd_driver_deb: amdgpu-install_7.2.1.70201-1_all.deb
    amd_driver_url: https://repo.radeon.com/amdgpu-install/7.2.1/ubuntu/noble/amdgpu-install_7.2.1.70201-1_all.deb

    # Base packages; header/module packages follow the running kernel.
    rigby_packages:
      - curl
      - git
      - rsync
      - software-properties-common
      - python-is-python3
      - python3.13
      - python3.13-venv
      - python3.13-dev
      - build-essential
      - "linux-headers-{{ ansible_kernel }}"
      - "linux-modules-extra-{{ ansible_kernel }}"
      - samba

  tasks:
    # --- Base packages -----------------------------------------------------
    - name: Ensure deadsnakes PPA is configured
      ansible.builtin.apt_repository:
        repo: ppa:deadsnakes/ppa
        state: present
        update_cache: true

    - name: Install required Ubuntu packages
      ansible.builtin.apt:
        name: "{{ rigby_packages }}"
        state: present
        update_cache: true

    # --- AMD driver and ROCm -----------------------------------------------
    - name: Ensure AMD installer package is present
      ansible.builtin.get_url:
        url: "{{ amd_driver_url }}"
        dest: "/tmp/{{ amd_driver_deb }}"
        mode: "0644"

    - name: Install AMD installer package
      ansible.builtin.apt:
        deb: "/tmp/{{ amd_driver_deb }}"
        state: present

    - name: Install AMD GPU DKMS driver
      ansible.builtin.apt:
        name: amdgpu-dkms
        state: present
        update_cache: true

    - name: Install ROCm stack
      ansible.builtin.apt:
        name: rocm
        state: present

    # --- Users, groups and SSH access --------------------------------------
    - name: Ensure required groups exist
      ansible.builtin.group:
        name: "{{ item }}"
        state: present
      loop:
        - render
        - video
        - "{{ comfy_group }}"

    - name: Ensure comfy user exists
      ansible.builtin.user:
        name: "{{ comfy_user }}"
        group: "{{ comfy_group }}"
        groups:
          - render
          - video
        append: true
        create_home: true
        shell: /bin/bash

    - name: Ensure bryan is in required groups
      ansible.builtin.user:
        name: "{{ rigby_user }}"
        groups:
          - render
          - video
          - "{{ comfy_group }}"
        append: true

    - name: Ensure recovery SSH keys are present for bryan
      ansible.posix.authorized_key:
        user: "{{ rigby_user }}"
        state: present
        key: "{{ item }}"
      loop: "{{ rigby_recovery_ssh_keys }}"

    # --- Boot configuration ------------------------------------------------
    - name: Configure GRUB default kernel args
      ansible.builtin.lineinfile:
        path: /etc/default/grub
        regexp: '^GRUB_CMDLINE_LINUX_DEFAULT='
        line: 'GRUB_CMDLINE_LINUX_DEFAULT="{{ grub_cmdline_linux_default }}"'

    - name: Ensure GRUB menu is shown
      ansible.builtin.lineinfile:
        path: /etc/default/grub
        regexp: '^{{ item.key }}='
        line: "{{ item.key }}={{ item.value }}"
      loop:
        - { key: GRUB_TIMEOUT_STYLE, value: "menu" }
        - { key: GRUB_TIMEOUT, value: "5" }

    - name: Regenerate grub config
      ansible.builtin.command: update-grub
      changed_when: true

    # --- ComfyUI application -----------------------------------------------
    # The output directory lives inside the checkout, so it is created only
    # after the git task: cloning into a directory that already has content
    # fails on a first-time recovery.
    - name: Ensure Comfy directories exist
      ansible.builtin.file:
        path: "{{ item.path }}"
        state: directory
        owner: "{{ comfy_user }}"
        group: "{{ comfy_group }}"
        mode: "{{ item.mode }}"
      loop:
        - { path: "{{ comfy_home }}", mode: "0775" }
        - { path: "{{ comfy_root }}", mode: "0775" }
        - { path: "{{ comfy_home }}/.local/bin", mode: "0775" }
        - { path: "{{ comfy_home }}/piptmp", mode: "0775" }

    # bash is requested explicitly: the script uses `set -o pipefail`, which
    # the default /bin/sh is not guaranteed to support.
    - name: Ensure uv is installed for comfy
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          curl -LsSf https://astral.sh/uv/install.sh | sh
        creates: "{{ comfy_home }}/.local/bin/uv"
        executable: /bin/bash
      become_user: "{{ comfy_user }}"

    - name: Ensure ComfyUI repo is present at pinned revision
      ansible.builtin.git:
        repo: "{{ comfy_repo_url }}"
        dest: "{{ comfy_root }}"
        version: "{{ comfy_repo_version }}"
        update: true
      become_user: "{{ comfy_user }}"

    - name: Ensure Comfy output directory exists
      ansible.builtin.file:
        path: "{{ comfy_output_dir }}"
        state: directory
        owner: "{{ comfy_user }}"
        group: "{{ comfy_group }}"
        mode: "2775"

    - name: Ensure ComfyUI venv exists
      ansible.builtin.command:
        argv:
          - "{{ comfy_home }}/.local/bin/uv"
          - venv
          - --python
          - "{{ comfy_python_version }}"
          - "{{ comfy_venv }}"
        creates: "{{ comfy_venv }}/bin/python"
      become_user: "{{ comfy_user }}"

    - name: Install base Python packaging tools in Comfy venv
      ansible.builtin.command:
        argv:
          - "{{ comfy_home }}/.local/bin/uv"
          - pip
          - install
          - --python
          - "{{ comfy_venv }}/bin/python"
          - --upgrade
          - pip
          - setuptools
          - wheel
      become_user: "{{ comfy_user }}"

    - name: Install ROCm PyTorch in Comfy venv
      ansible.builtin.command:
        argv:
          - "{{ comfy_home }}/.local/bin/uv"
          - pip
          - install
          - --python
          - "{{ comfy_venv }}/bin/python"
          - --index-url
          - "{{ comfy_torch_index_url }}"
          - torch
          - torchvision
          - torchaudio
      environment:
        TMPDIR: "{{ comfy_home }}/piptmp"
      become_user: "{{ comfy_user }}"

    - name: Install ComfyUI requirements in Comfy venv
      ansible.builtin.command:
        argv:
          - "{{ comfy_home }}/.local/bin/uv"
          - pip
          - install
          - --python
          - "{{ comfy_venv }}/bin/python"
          - -r
          - "{{ comfy_root }}/requirements.txt"
      environment:
        TMPDIR: "{{ comfy_home }}/piptmp"
      become_user: "{{ comfy_user }}"

    - name: Install ComfyUI-Manager in Comfy venv
      ansible.builtin.command:
        argv:
          - "{{ comfy_home }}/.local/bin/uv"
          - pip
          - install
          - --python
          - "{{ comfy_venv }}/bin/python"
          - "comfyui-manager=={{ comfy_manager_version }}"
      environment:
        TMPDIR: "{{ comfy_home }}/piptmp"
      become_user: "{{ comfy_user }}"

    # --- Output tree ownership and permissions -----------------------------
    - name: Ensure output directories have group inheritance
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          find "{{ comfy_output_dir }}" -type d -exec chown {{ comfy_user }}:{{ comfy_group }} {} +
          find "{{ comfy_output_dir }}" -type d -exec chmod 2775 {} +
        executable: /bin/bash
      changed_when: true

    - name: Ensure output files are group writable
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          find "{{ comfy_output_dir }}" -type f -exec chown {{ comfy_user }}:{{ comfy_group }} {} +
          find "{{ comfy_output_dir }}" -type f -exec chmod 0664 {} +
        executable: /bin/bash
      changed_when: true

    # --- Services ----------------------------------------------------------
    - name: Install ComfyUI systemd unit
      ansible.builtin.template:
        src: ../templates/comfyui.service.j2
        dest: /etc/systemd/system/comfyui.service
        owner: root
        group: root
        mode: "0644"

    - name: Ensure Samba include directory exists
      ansible.builtin.file:
        path: /etc/samba/smb.conf.d
        state: directory
        owner: root
        group: root
        mode: "0755"

    - name: Install Samba share config for Comfy outputs
      ansible.builtin.template:
        src: ../templates/comfy-output.conf.j2
        dest: /etc/samba/smb.conf.d/comfy-output.conf
        owner: root
        group: root
        mode: "0644"

    - name: Ensure Samba includes conf.d snippets
      ansible.builtin.blockinfile:
        path: /etc/samba/smb.conf
        marker: "; {mark} ANSIBLE MANAGED COMFY OUTPUT INCLUDE"
        block: |
          include = /etc/samba/smb.conf.d/comfy-output.conf

    - name: Reload systemd
      ansible.builtin.systemd_service:
        daemon_reload: true

    - name: Ensure ComfyUI service is installed but disabled
      ansible.builtin.systemd_service:
        name: comfyui.service
        enabled: false

    - name: Ensure Samba service is enabled and running
      ansible.builtin.systemd_service:
        name: smbd.service
        enabled: true
        state: started

    # --- Static network (kept last) ----------------------------------------
    # Netplan configuration should not be readable by other users, hence 0600.
    - name: Install netplan static IP config for rigby
      ansible.builtin.template:
        src: ../templates/99-rigby-static.yaml.j2
        dest: /etc/netplan/99-rigby-static.yaml
        owner: root
        group: root
        mode: "0600"
      when: rigby_static_network_enabled | bool

    # Catch configuration errors while the connection is still usable.
    - name: Validate netplan configuration
      ansible.builtin.command: netplan generate
      when: rigby_static_network_enabled | bool
      changed_when: false

    # Applying the new address can drop the SSH session carrying this play, so
    # the command is started in the background and not waited on.
    - name: Apply static netplan configuration as final step
      ansible.builtin.command: netplan apply
      async: 60
      poll: 0
      when: rigby_static_network_enabled | bool
      changed_when: true

View file

@ -0,0 +1,16 @@
{# Static addressing for rigby. The loop tags stay at column 0 so they add no stray indentation to the rendered YAML. #}
network:
  version: 2
  renderer: networkd
  ethernets:
    {{ rigby_interface }}:
      dhcp4: false
      addresses:
        - {{ rigby_static_ip }}
      routes:
        - to: default
          via: {{ rigby_gateway }}
      nameservers:
        addresses:
{% for resolver in rigby_dns %}
          - {{ resolver }}
{% endfor %}

View file

@ -0,0 +1,9 @@
# Samba share exposing the ComfyUI output directory.
# NOTE(review): the share is writable and allows guest access — confirm this
# is intended for the network rigby sits on.
[comfy-output]
    path = {{ comfy_output_dir }}
    browseable = yes
    guest ok = yes
    read only = no
    # Files written through the share are owned by the ComfyUI account.
    force user = {{ comfy_user }}
    force group = {{ comfy_group }}
    create mask = 0664
    directory mask = 2775

View file

@ -0,0 +1,22 @@
# systemd unit for ComfyUI, rendered by the recovery playbook.
[Unit]
Description=ComfyUI
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
User={{ comfy_user }}
Group={{ comfy_group }}
WorkingDirectory={{ comfy_root }}
# 0002 keeps files created by the service group-writable.
UMask=0002
Environment=HOME={{ comfy_home }}
Environment=COMFYUI_PATH={{ comfy_root }}
Environment=PATH={{ comfy_home }}/.local/bin:{{ comfy_venv }}/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
ExecStart={{ comfy_venv }}/bin/python {{ comfy_root }}/main.py --highvram --enable-manager --listen 0.0.0.0 --port {{ comfy_port }} --disable-auto-launch
Restart=on-failure
RestartSec=5
PrivateTmp=true
NoNewPrivileges=true

[Install]
WantedBy=multi-user.target

View file

@ -71,6 +71,7 @@
packages = [
just
rclone
ansible
age
sops

View file

@ -1,10 +1,27 @@
# Name of the current system, taken from $HOSTNAME.
SYSTEM := "$(echo $HOSTNAME)"
# System names accepted by the recipes that take a SYSTEM argument.
VALID_SYSTEMS := "desktop server wsl"
# Directory (relative to the repo root) the rigby recipes change into.
RIGBY_DIR := "external/rigby"
# Default address used by the rigby recipes when HOST is not supplied.
RIGBY_HOST := "192.168.0.23"
# Show the available recipes
default:
    @just --list
# Ping the Ubuntu AI rig through Ansible to confirm SSH and Ansible access.
[group('rigby')]
rigby-check HOST=RIGBY_HOST:
    @cd {{RIGBY_DIR}} && ansible --inventory "{{HOST}}," all --user bryan --module-name ping
# Apply the disaster-recovery playbook for the Ubuntu AI rig.
# The play targets the `ai_rig` group, so the inventory configured in
# ansible.cfg is used; an ad-hoc `-i "<ip>,"` inventory has no such group and
# the play would match no hosts. HOST only overrides the address connected to.
[group('rigby')]
rigby-recover HOST=RIGBY_HOST:
    @cd {{RIGBY_DIR}} && ansible-playbook -u bryan -e "ansible_host={{HOST}}" playbooks/recover.yml
# Preview rig recovery changes without modifying the target host.
# The play targets the `ai_rig` group, so the inventory configured in
# ansible.cfg is used; an ad-hoc `-i "<ip>,"` inventory has no such group and
# the play would match no hosts. HOST only overrides the address connected to.
[group('rigby')]
rigby-recover-dry-run HOST=RIGBY_HOST:
    @cd {{RIGBY_DIR}} && ansible-playbook -u bryan -e "ansible_host={{HOST}}" playbooks/recover.yml --check --diff
# Validate system argument
[private]
_validate SYSTEM:

View file

@ -34,6 +34,23 @@
};
};
# User service that mounts rigby's ComfyUI output directory at ~/Media/Comfy
# over SSHFS and unmounts it when stopped.
systemd.user.services.comfy-mount = {
  Unit.Description = "Mount ComfyUI outputs via SSHFS";
  # NOTE(review): network-online.target is a system unit; confirm this
  # ordering has any effect inside the user manager.
  Unit.After = [ "network-online.target" ];

  Service.Type = "oneshot";
  Service.RemainAfterExit = true;
  Service.ExecStartPre = "${pkgs.coreutils}/bin/mkdir -p %h/Media/Comfy";
  Service.ExecStart = "${pkgs.sshfs}/bin/sshfs -o reconnect,ServerAliveInterval=15,ServerAliveCountMax=3 rigby:/home/comfy/ComfyUI/output %h/Media/Comfy";
  # NOTE(review): verify that the store copy of fusermount is permitted to
  # unmount as an unprivileged user on this system.
  Service.ExecStop = "${pkgs.fuse}/bin/fusermount -u %h/Media/Comfy";

  Install.WantedBy = [ "default.target" ];
};
programs.ssh = {
enable = true;
enableDefaultConfig = false;
@ -46,6 +63,10 @@
hostname = "192.168.0.154";
user = "bryan";
};
"rigby" = {
hostname = "192.168.0.23";
user = "bryan";
};
};
};

View file

@ -72,6 +72,7 @@ in
environment = {
systemPackages = with pkgs; [
ansible
vim
git
usbutils