diff --git a/external/README.md b/external/README.md new file mode 100644 index 0000000..3ceaaf9 --- /dev/null +++ b/external/README.md @@ -0,0 +1,6 @@ +# External Automation + +This directory contains automation for systems that are not managed as NixOS +hosts inside this repository. + +- `rigby/`: Ubuntu-based AI rig recovery and service automation. diff --git a/external/rigby/README.md b/external/rigby/README.md new file mode 100644 index 0000000..2fe3d06 --- /dev/null +++ b/external/rigby/README.md @@ -0,0 +1,64 @@ +# Rigby Recovery + +This directory contains disaster-recovery automation for `rigby`, the Ubuntu +AI rig at `192.168.0.23`. + +## Scope + +This automation manages the host state after a manual Ubuntu install. + +It is intended to restore the working state we validated for: + +- AMD ROCm `7.2.1` +- `amdgpu-dkms` +- `amdgpu.cwsr_enable=0` +- pinned ComfyUI checkout +- `uv`-managed Python `3.13` venv +- ROCm PyTorch +- ComfyUI service layout +- output sharing over Samba +- required groups and permissions + +## Manual Prerequisites + +These are intentionally documented, not automated: + +- Install Ubuntu `24.04.4` +- Update BIOS to the known-good version for the board +- Verify BIOS settings: + - `Above 4G Decoding = Enabled` + - `SVM = Enabled` + - UEFI boot + - sane PCIe slot configuration +- Ensure host SSH is reachable as `bryan` +- Ensure passwordless sudo works for `bryan` +- Ensure the initial DHCP lease is known so recovery can begin + +## Recovery Flow + +1. Install Ubuntu manually. +2. Clone this repository onto the operator machine. +3. From the repo root, run `just rigby-check <dhcp-ip>`. +4. Run `just rigby-recover <dhcp-ip>`. +5. Reboot `rigby`. +6. Validate: + - `rocminfo` + - `rocm-smi` + - ComfyUI startup + +## Notes + +- The AMD repo and package installs are automated here, but BIOS and physical + host setup remain manual. +- ComfyUI itself is deployed as an application under `/home/comfy/ComfyUI`. 
+- The `comfyui.service` unit is installed but left disabled so the service is + started on demand. +- Models, LoRAs, VAEs, outputs, and other AI assets are not restored by this + automation. `rigby` is the source of truth for that data, so disaster + recovery for models requires a separate backup strategy. +- The `just` entrypoints take the target address as an optional argument + (for example `just rigby-check 192.168.0.50`), so recovery does not depend on a fixed DHCP lease. +- Recovery installs the configured SSH key for `bryan`. +- Static IP configuration is applied at the end of the playbook via netplan. + The SSH session used for recovery may be interrupted once the new address is + applied, and subsequent access should use the final static IP. diff --git a/external/rigby/ansible.cfg b/external/rigby/ansible.cfg new file mode 100644 index 0000000..0e18eec --- /dev/null +++ b/external/rigby/ansible.cfg @@ -0,0 +1,9 @@ +[defaults] +inventory = inventory.ini +host_key_checking = False +stdout_callback = yaml +retry_files_enabled = False +interpreter_python = auto_silent + +[ssh_connection] +pipelining = True diff --git a/external/rigby/inventory.ini b/external/rigby/inventory.ini new file mode 100644 index 0000000..cb7f2b4 --- /dev/null +++ b/external/rigby/inventory.ini @@ -0,0 +1,2 @@ +[ai_rig] +rigby ansible_host=192.168.0.23 ansible_user=bryan diff --git a/external/rigby/playbooks/recover.yml b/external/rigby/playbooks/recover.yml new file mode 100644 index 0000000..fb9cf16 --- /dev/null +++ b/external/rigby/playbooks/recover.yml @@ -0,0 +1,306 @@ +--- +- name: Recover rigby AI rig + hosts: ai_rig + become: true + vars: + rigby_user: bryan + rigby_recovery_ssh_keys: + - ssh-rsa 
AAAAB3NzaC1yc2EAAAADAQABAAACAQDl4895aB9P5p/lp8Hq5rHun4clvhyTSHFi3U2d6OOBoW5Fm+VcQnW/xbjmCBsXk5BdiowsBxQhwnzdfz/KJL7J5RobomUEaVRwb9UwT88eJveLp14BG8j2J3SjfyhrCX+4jkPx0bPQk1HGcuYY+tPEXf1q/ps88Dhu0CARBIzYQOTYY6b1qWzxpDoFZGHjKG8g5iY6FIu65yKKvvVy1f8IgZ3l3IpwBWVamxgkTcYY0QYSrmzo1n7TXxwrWbvenAqBsQ0cBPs+gVa3uIr+1TJl0Az5SElBVGu3LvUdlk58trtPUj6TQR3YUkg7Vjll7WHOdqhux5ZQNhjkOsHerf0Tw86e6cEzgeTuIbQHIb0LcsUunwKcuh2+au7RO599cvHn0+xZE5MZBxloDDaJ3JsiliM8kyPP/U3ERj03cWLW7BqbT+sfjAOl21RCzk0iQxk1wt/8VmtCr9Adv7IyrtaYvf/bwRP+g+9ldmzKGt8Mdb605uVzZ70H/LLm17f40Te+QHaex5by/6p6cuwEEZtgIg53Wpglu0rA6UxrBfQEHKl/Jt3FLeE0mnEyYkkR2MnHNtyWRIXtuqYZMAm2Ub1pFHH7jQV1gGiDVTw6a2eIwK21a/hXtRjFUpFd1nB1n+KNfJBE4zT3wm3Ud7mKw/6rWnoRyhYZvGXkFdp+iEs49Q== itme-brain@github/78120816
+    rigby_static_network_enabled: true  # gates both netplan tasks at the end of the play
+    rigby_interface: eno1  # key under "ethernets" in 99-rigby-static.yaml.j2
+    rigby_static_ip: 192.168.0.23/24  # rendered into the netplan "addresses" list
+    rigby_gateway: 192.168.0.1  # "via" of the default route in the netplan template
+    rigby_dns:  # rendered one entry per line under "nameservers"
+      - 192.168.0.1
+      - 1.1.1.1
+    comfy_user: comfy  # owner of the checkout and venv; User= of comfyui.service
+    comfy_group: comfy  # bryan is added to this group by the play
+    comfy_home: /home/comfy
+    comfy_root: /home/comfy/ComfyUI  # destination of the git checkout
+    comfy_venv: /home/comfy/comfy-venv
+    comfy_python_version: "3.13"  # passed to "uv venv --python"
+    comfy_port: 8188  # --port argument in comfyui.service.j2
+    comfy_output_dir: /home/comfy/ComfyUI/output  # path of the Samba share
+    comfy_repo_url: https://github.com/comfy-org/ComfyUI
+    comfy_repo_version: a1344238901efc5ea199d8094cb16fca36ceb28b  # revision the git task checks out
+    comfy_manager_version: "4.1"  # installed as comfyui-manager==<version>
+    comfy_torch_index_url: https://download.pytorch.org/whl/rocm7.2  # --index-url for torch, torchvision, torchaudio
+    grub_cmdline_linux_default: "amdgpu.cwsr_enable=0"  # becomes the whole GRUB_CMDLINE_LINUX_DEFAULT value
+    amd_driver_deb: amdgpu-install_7.2.1.70201-1_all.deb  # file name used for the download under /tmp
+    amd_driver_url: https://repo.radeon.com/amdgpu-install/7.2.1/ubuntu/noble/amdgpu-install_7.2.1.70201-1_all.deb
+    rigby_packages:  # installed in one apt transaction by the second task
+      - curl
+      - git
+      - rsync
+      - software-properties-common
+      - python-is-python3
+      - python3.13
+      - python3.13-venv
+      - python3.13-dev
+      - build-essential
+      - linux-headers-{{ ansible_kernel }}  # ansible_kernel is resolved from gathered facts
+      - linux-modules-extra-{{ ansible_kernel }}
+      - samba
+  tasks:
+    - name: Ensure deadsnakes PPA is configured
+      ansible.builtin.apt_repository:
+        repo: ppa:deadsnakes/ppa
+        state: present
+        update_cache:
true
+
+    - name: Install required Ubuntu packages
+      ansible.builtin.apt:
+        name: "{{ rigby_packages }}"
+        state: present
+        update_cache: true
+
+    - name: Ensure AMD installer package is present
+      ansible.builtin.get_url:
+        url: "{{ amd_driver_url }}"
+        dest: "/tmp/{{ amd_driver_deb }}"
+        mode: "0644"
+
+    - name: Install AMD installer package
+      ansible.builtin.apt:
+        deb: "/tmp/{{ amd_driver_deb }}"
+        state: present
+
+    - name: Install AMD GPU DKMS driver
+      ansible.builtin.apt:
+        name: amdgpu-dkms
+        state: present
+        update_cache: true
+
+    - name: Install ROCm stack
+      ansible.builtin.apt:
+        name: rocm
+        state: present
+
+    - name: Ensure required groups exist
+      ansible.builtin.group:
+        name: "{{ item }}"
+        state: present
+      loop:
+        - render
+        - video
+        - "{{ comfy_group }}"
+
+    - name: Ensure comfy user exists
+      ansible.builtin.user:
+        name: "{{ comfy_user }}"
+        group: "{{ comfy_group }}"
+        groups:
+          - render
+          - video
+        append: true
+        create_home: true
+        shell: /bin/bash
+
+    - name: Ensure bryan is in required groups
+      ansible.builtin.user:
+        name: "{{ rigby_user }}"
+        groups:
+          - render
+          - video
+          - "{{ comfy_group }}"
+        append: true
+
+    - name: Ensure recovery SSH keys are present for bryan
+      ansible.posix.authorized_key:
+        user: "{{ rigby_user }}"
+        state: present
+        key: "{{ item }}"
+      loop: "{{ rigby_recovery_ssh_keys }}"
+
+    - name: Configure GRUB default kernel args
+      ansible.builtin.lineinfile:
+        path: /etc/default/grub
+        regexp: '^GRUB_CMDLINE_LINUX_DEFAULT='
+        line: 'GRUB_CMDLINE_LINUX_DEFAULT="{{ grub_cmdline_linux_default }}"'
+
+    - name: Ensure GRUB menu is shown
+      ansible.builtin.lineinfile:
+        path: /etc/default/grub
+        regexp: '^{{ item.key }}='
+        line: "{{ item.key }}={{ item.value }}"
+      loop:
+        - { key: GRUB_TIMEOUT_STYLE, value: "menu" }
+        - { key: GRUB_TIMEOUT, value: "5" }
+
+    - name: Regenerate grub config
+      ansible.builtin.command: update-grub
+      changed_when: true
+
+    - name: Ensure ComfyUI repo is present at pinned revision
+      ansible.builtin.git:
+        repo: "{{ comfy_repo_url }}"
+        dest: "{{ comfy_root }}"  # cloned before output/ is created; git refuses a non-empty target
+        version: "{{ comfy_repo_version }}"
+      become_user: "{{ comfy_user }}"
+
+    - name: Ensure Comfy directories exist
+      ansible.builtin.file:
+        path: "{{ item.path }}"
+        state: directory
+        owner: "{{ comfy_user }}"
+        group: "{{ comfy_group }}"
+        mode: "{{ item.mode }}"
+      loop:
+        - { path: "{{ comfy_home }}", mode: "0775" }
+        - { path: "{{ comfy_root }}", mode: "0775" }
+        - { path: "{{ comfy_output_dir }}", mode: "2775" }
+        - { path: "{{ comfy_home }}/.local/bin", mode: "0775" }
+        - { path: "{{ comfy_home }}/piptmp", mode: "0775" }
+
+    - name: Ensure uv is installed for comfy
+      ansible.builtin.shell: |
+        set -euo pipefail
+        curl -LsSf https://astral.sh/uv/install.sh | sh
+      args:
+        creates: "{{ comfy_home }}/.local/bin/uv"
+        executable: /bin/bash  # the default /bin/sh (dash) rejects "set -o pipefail"
+      become_user: "{{ comfy_user }}"
+
+    - name: Ensure ComfyUI venv exists
+      ansible.builtin.command:
+        argv:
+          - "{{ comfy_home }}/.local/bin/uv"
+          - venv
+          - --python
+          - "{{ comfy_python_version }}"
+          - "{{ comfy_venv }}"
+      args:
+        creates: "{{ comfy_venv }}/bin/python"
+      become_user: "{{ comfy_user }}"
+
+    - name: Install base Python packaging tools in Comfy venv
+      ansible.builtin.command:
+        argv:
+          - "{{ comfy_home }}/.local/bin/uv"
+          - pip
+          - install
+          - --python
+          - "{{ comfy_venv }}/bin/python"
+          - --upgrade
+          - pip
+          - setuptools
+          - wheel
+      become_user: "{{ comfy_user }}"
+
+    - name: Install ROCm PyTorch in Comfy venv
+      ansible.builtin.command:
+        argv:
+          - "{{ comfy_home }}/.local/bin/uv"
+          - pip
+          - install
+          - --python
+          - "{{ comfy_venv }}/bin/python"
+          - --index-url
+          - "{{ comfy_torch_index_url }}"
+          - torch
+          - torchvision
+          - torchaudio
+      environment:
+        TMPDIR: "{{ comfy_home }}/piptmp"
+      become_user: "{{ comfy_user }}"
+
+    - name: Install ComfyUI requirements in Comfy venv
+      ansible.builtin.command:
+        argv:
+          - "{{ comfy_home }}/.local/bin/uv"
+          - pip
+          - install
+          - --python
+          - "{{ comfy_venv }}/bin/python"
+          - -r
+          - "{{ comfy_root }}/requirements.txt"
+
environment:
+        TMPDIR: "{{ comfy_home }}/piptmp"
+      become_user: "{{ comfy_user }}"
+
+    - name: Install ComfyUI-Manager in Comfy venv
+      ansible.builtin.command:
+        argv:
+          - "{{ comfy_home }}/.local/bin/uv"
+          - pip
+          - install
+          - --python
+          - "{{ comfy_venv }}/bin/python"
+          - "comfyui-manager=={{ comfy_manager_version }}"
+      environment:
+        TMPDIR: "{{ comfy_home }}/piptmp"
+      become_user: "{{ comfy_user }}"
+
+    - name: Ensure output directories have group inheritance
+      ansible.builtin.shell: |
+        set -eu  # no pipeline here, and /bin/sh (dash) rejects "-o pipefail"
+        find "{{ comfy_output_dir }}" -type d -exec chown {{ comfy_user }}:{{ comfy_group }} {} +
+        find "{{ comfy_output_dir }}" -type d -exec chmod 2775 {} +
+      changed_when: true
+
+    - name: Ensure output files are group writable
+      ansible.builtin.shell: |
+        set -eu  # no pipeline here, and /bin/sh (dash) rejects "-o pipefail"
+        find "{{ comfy_output_dir }}" -type f -exec chown {{ comfy_user }}:{{ comfy_group }} {} +
+        find "{{ comfy_output_dir }}" -type f -exec chmod 0664 {} +
+      changed_when: true
+
+    - name: Install ComfyUI systemd unit
+      ansible.builtin.template:
+        src: ../templates/comfyui.service.j2
+        dest: /etc/systemd/system/comfyui.service
+        owner: root
+        group: root
+        mode: "0644"
+
+    - name: Ensure Samba include directory exists
+      ansible.builtin.file:
+        path: /etc/samba/smb.conf.d
+        state: directory
+        owner: root
+        group: root
+        mode: "0755"
+
+    - name: Install Samba share config for Comfy outputs
+      ansible.builtin.template:
+        src: ../templates/comfy-output.conf.j2
+        dest: /etc/samba/smb.conf.d/comfy-output.conf
+        owner: root
+        group: root
+        mode: "0644"
+
+    - name: Ensure Samba includes conf.d snippets
+      ansible.builtin.blockinfile:
+        path: /etc/samba/smb.conf
+        marker: "; {mark} ANSIBLE MANAGED COMFY OUTPUT INCLUDE"
+        block: |
+          include = /etc/samba/smb.conf.d/comfy-output.conf
+
+    - name: Reload systemd
+      ansible.builtin.systemd_service:
+        daemon_reload: true
+
+    - name: Ensure ComfyUI service is installed but disabled
+      ansible.builtin.systemd_service:
+        name: comfyui.service
+        enabled: false
+
+    -
name: Ensure Samba service is enabled and running
+      ansible.builtin.systemd_service:
+        name: smbd.service
+        enabled: true
+        state: started
+
+    - name: Install netplan static IP config for rigby
+      ansible.builtin.template:
+        src: ../templates/99-rigby-static.yaml.j2
+        dest: /etc/netplan/99-rigby-static.yaml
+        owner: root
+        group: root
+        mode: "0600"  # netplan warns when its config is readable by non-root users
+      when: rigby_static_network_enabled | bool
+
+    - name: Apply static netplan configuration as final step
+      ansible.builtin.command: netplan apply
+      when: rigby_static_network_enabled | bool
+      changed_when: true
diff --git a/external/rigby/templates/99-rigby-static.yaml.j2 b/external/rigby/templates/99-rigby-static.yaml.j2
new file mode 100644
index 0000000..209d127
--- /dev/null
+++ b/external/rigby/templates/99-rigby-static.yaml.j2
@@ -0,0 +1,16 @@
+network:
+  version: 2
+  renderer: networkd
+  ethernets:
+    {{ rigby_interface }}:
+      dhcp4: false
+      addresses:
+        - {{ rigby_static_ip }}
+      routes:
+        - to: default
+          via: {{ rigby_gateway }}
+      nameservers:
+        addresses:
+{% for dns in rigby_dns %}
+          - {{ dns }}
+{% endfor %}
diff --git a/external/rigby/templates/comfy-output.conf.j2 b/external/rigby/templates/comfy-output.conf.j2
new file mode 100644
index 0000000..32a7c86
--- /dev/null
+++ b/external/rigby/templates/comfy-output.conf.j2
@@ -0,0 +1,9 @@
+[comfy-output]
+  path = {{ comfy_output_dir }}
+  browseable = yes
+  read only = no
+  guest ok = yes
+  force user = {{ comfy_user }}
+  force group = {{ comfy_group }}
+  create mask = 0664
+  directory mask = 2775
diff --git a/external/rigby/templates/comfyui.service.j2 b/external/rigby/templates/comfyui.service.j2
new file mode 100644
index 0000000..aa0ac7d
--- /dev/null
+++ b/external/rigby/templates/comfyui.service.j2
@@ -0,0 +1,22 @@
+[Unit]
+Description=ComfyUI
+After=network-online.target
+Wants=network-online.target
+
+[Service]
+Type=simple
+User={{ comfy_user }}
+Group={{ comfy_group }}
+UMask=0002
+WorkingDirectory={{ comfy_root }}
+Environment=HOME={{ comfy_home }}
+Environment=COMFYUI_PATH={{ comfy_root }} +Environment=PATH={{ comfy_home }}/.local/bin:{{ comfy_venv }}/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin +ExecStart={{ comfy_venv }}/bin/python {{ comfy_root }}/main.py --highvram --enable-manager --listen 0.0.0.0 --port {{ comfy_port }} --disable-auto-launch +Restart=on-failure +RestartSec=5 +NoNewPrivileges=true +PrivateTmp=true + +[Install] +WantedBy=multi-user.target diff --git a/flake.nix b/flake.nix index ff6d08b..76b1cc9 100644 --- a/flake.nix +++ b/flake.nix @@ -71,6 +71,7 @@ packages = [ just rclone + ansible age sops diff --git a/justfile b/justfile index f2aea1d..2869dcb 100644 --- a/justfile +++ b/justfile @@ -1,10 +1,27 @@ SYSTEM := "$(echo $HOSTNAME)" VALID_SYSTEMS := "desktop server wsl" +RIGBY_DIR := "external/rigby" +RIGBY_HOST := "192.168.0.23" # Print this list default: @just --list +# Verify SSH connectivity and Ansible access to the Ubuntu AI rig (HOST overrides the inventory address). +[group('rigby')] +rigby-check HOST=RIGBY_HOST: + @cd {{RIGBY_DIR}} && ansible -i inventory.ini ai_rig -e "ansible_host={{HOST}}" -u bryan -m ping + +# Apply the disaster-recovery playbook for the Ubuntu AI rig (HOST overrides the inventory address). +[group('rigby')] +rigby-recover HOST=RIGBY_HOST: + @cd {{RIGBY_DIR}} && ansible-playbook -i inventory.ini -e "ansible_host={{HOST}}" -u bryan playbooks/recover.yml + +# Preview rig recovery changes without modifying the target host. 
+[group('rigby')] +rigby-recover-dry-run HOST=RIGBY_HOST: + @cd {{RIGBY_DIR}} && ansible-playbook -i inventory.ini -e "ansible_host={{HOST}}" -u bryan playbooks/recover.yml --check --diff + # Validate system argument [private] _validate SYSTEM: diff --git a/system/machines/desktop/modules/home-manager/home.nix b/system/machines/desktop/modules/home-manager/home.nix index 59b2299..41e88bc 100644 --- a/system/machines/desktop/modules/home-manager/home.nix +++ b/system/machines/desktop/modules/home-manager/home.nix @@ -34,6 +34,23 @@ }; }; + systemd.user.services.comfy-mount = { + Unit = { + Description = "Mount ComfyUI outputs via SSHFS"; + After = [ "network-online.target" ]; + }; + Service = { + Type = "oneshot"; + RemainAfterExit = true; + ExecStartPre = "${pkgs.coreutils}/bin/mkdir -p %h/Media/Comfy"; + ExecStart = "${pkgs.sshfs}/bin/sshfs -o reconnect,ServerAliveInterval=15,ServerAliveCountMax=3 rigby:/home/comfy/ComfyUI/output %h/Media/Comfy"; + ExecStop = "${pkgs.fuse}/bin/fusermount -u %h/Media/Comfy"; + }; + Install = { + WantedBy = [ "default.target" ]; + }; + }; + programs.ssh = { enable = true; enableDefaultConfig = false; @@ -46,6 +63,10 @@ hostname = "192.168.0.154"; user = "bryan"; }; + "rigby" = { + hostname = "192.168.0.23"; + user = "bryan"; + }; }; }; diff --git a/system/machines/desktop/system.nix b/system/machines/desktop/system.nix index 86d1aa5..ca2c24b 100644 --- a/system/machines/desktop/system.nix +++ b/system/machines/desktop/system.nix @@ -72,6 +72,7 @@ in environment = { systemPackages = with pkgs; [ + ansible vim git usbutils