This commit is contained in:
Bryan Ramos 2026-04-15 20:58:07 -04:00
commit 864c69fe61
147 changed files with 11233 additions and 0 deletions

64
external/rigby/README.md vendored Normal file
View file

@ -0,0 +1,64 @@
# Rigby Recovery
This directory contains disaster-recovery automation for `rigby`, the Ubuntu
AI rig at `192.168.0.23`.
## Scope
This automation manages the host state after a manual Ubuntu install.
It is intended to restore the working state we validated for:
- AMD ROCm `7.2.1`
- `amdgpu-dkms`
- `amdgpu.cwsr_enable=0`
- pinned ComfyUI checkout
- `uv`-managed Python `3.13` venv
- ROCm PyTorch
- ComfyUI service layout
- output sharing over Samba
- required groups and permissions
## Manual Prerequisites
These are intentionally documented, not automated:
- Install Ubuntu `24.04.4`
- Update BIOS to the known-good version for the board
- Verify BIOS settings:
- `Above 4G Decoding = Enabled`
- `SVM = Enabled`
- UEFI boot
- sane PCIe slot configuration
- Ensure host SSH is reachable as `bryan`
- Ensure passwordless sudo works for `bryan`
- Ensure the initial DHCP lease is known so recovery can begin
## Recovery Flow
1. Install Ubuntu manually.
2. Clone this repository onto the operator machine.
3. From the repo root, run `just rigby-check HOST=<rigby-ip>`.
4. Run `just rigby-recover HOST=<rigby-ip>`.
5. Reboot `rigby`.
6. Validate:
- `rocminfo`
- `rocm-smi`
- ComfyUI startup
## Notes
- The AMD repo and package installs are automated here, but BIOS and physical
host setup remain manual.
- ComfyUI itself is deployed as an application under `/home/comfy/ComfyUI`.
- The `comfyui.service` unit is installed but left disabled so that it can be
  started on demand.
- Models, LoRAs, VAEs, outputs, and other AI assets are not restored by this
automation. `rigby` is the source of truth for that data, so disaster
recovery for models requires a separate backup strategy.
- The `just` entrypoints accept `HOST=<ip>` so recovery does not depend on a
fixed DHCP lease.
- Recovery installs the configured SSH key for `bryan`.
- Static IP configuration is applied via netplan during the playbook run.
  The SSH session used for recovery may be interrupted once the new address is
  applied; subsequent access should use the final static IP.

9
external/rigby/ansible.cfg vendored Normal file
View file

@ -0,0 +1,9 @@
# Ansible configuration for rigby recovery runs from this directory.
[defaults]
# Hosts come from the static inventory next to this file.
inventory = inventory.ini
# The target is freshly reinstalled, so its host key changes on every
# recovery; skipping the check avoids manual known_hosts cleanup.
# NOTE(review): this disables MITM protection — trusted LAN use only.
host_key_checking = False
# YAML-formatted task output for readability.
# NOTE(review): on newer Ansible releases the `yaml` callback ships in the
# community.general collection — confirm it is available on the operator box.
stdout_callback = yaml
# Do not litter the tree with .retry files.
retry_files_enabled = False
# Auto-detect the remote Python without emitting discovery warnings.
interpreter_python = auto_silent

[ssh_connection]
# Fewer SSH round-trips per task; requires sudo without requiretty
# (Ubuntu default).
pipelining = True

2
external/rigby/inventory.ini vendored Normal file
View file

@ -0,0 +1,2 @@
# Static inventory for the rigby AI rig (single host). The just entrypoints
# can override the address with HOST=<ip> while the host is still on its
# initial DHCP lease.
[ai_rig]
rigby ansible_host=192.168.0.23 ansible_user=bryan

442
external/rigby/playbooks/recover.yml vendored Normal file
View file

@ -0,0 +1,442 @@
---
# Disaster-recovery play for the rigby AI rig. Assumes a fresh manual
# Ubuntu install reachable over SSH with passwordless sudo for the
# operator account (manual prerequisites are listed in the README).
- name: Recover rigby AI rig
  hosts: ai_rig
  become: true
  vars:
    # Operator account that receives SSH keys and group memberships.
    rigby_user: bryan
    # Public keys installed into the operator's authorized_keys.
    rigby_recovery_ssh_keys:
      - ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQDl4895aB9P5p/lp8Hq5rHun4clvhyTSHFi3U2d6OOBoW5Fm+VcQnW/xbjmCBsXk5BdiowsBxQhwnzdfz/KJL7J5RobomUEaVRwb9UwT88eJveLp14BG8j2J3SjfyhrCX+4jkPx0bPQk1HGcuYY+tPEXf1q/ps88Dhu0CARBIzYQOTYY6b1qWzxpDoFZGHjKG8g5iY6FIu65yKKvvVy1f8IgZ3l3IpwBWVamxgkTcYY0QYSrmzo1n7TXxwrWbvenAqBsQ0cBPs+gVa3uIr+1TJl0Az5SElBVGu3LvUdlk58trtPUj6TQR3YUkg7Vjll7WHOdqhux5ZQNhjkOsHerf0Tw86e6cEzgeTuIbQHIb0LcsUunwKcuh2+au7RO599cvHn0+xZE5MZBxloDDaJ3JsiliM8kyPP/U3ERj03cWLW7BqbT+sfjAOl21RCzk0iQxk1wt/8VmtCr9Adv7IyrtaYvf/bwRP+g+9ldmzKGt8Mdb605uVzZ70H/LLm17f40Te+QHaex5by/6p6cuwEEZtgIg53Wpglu0rA6UxrBfQEHKl/Jt3FLeE0mnEyYkkR2MnHNtyWRIXtuqYZMAm2Ub1pFHH7jQV1gGiDVTw6a2eIwK21a/hXtRjFUpFd1nB1n+KNfJBE4zT3wm3Ud7mKw/6rWnoRyhYZvGXkFdp+iEs49Q== itme-brain@github/78120816
    # Static network settings applied via netplan later in the play.
    rigby_static_network_enabled: true
    rigby_interface: eno1
    rigby_static_ip: 192.168.0.23/24
    rigby_gateway: 192.168.0.1
    rigby_dns:
      - 192.168.0.1
      - 1.1.1.1
    # Dedicated service account and filesystem layout for ComfyUI.
    comfy_user: comfy
    comfy_group: comfy
    comfy_home: /home/comfy
    comfy_root: /home/comfy/ComfyUI
    comfy_venv: /home/comfy/comfy-venv
    comfy_python_version: "3.13"
    comfy_port: 8188
    comfy_output_dir: /home/comfy/ComfyUI/output
    # Pinned ComfyUI checkout (exact commit) plus companion versions.
    comfy_repo_url: https://github.com/comfy-org/ComfyUI
    comfy_repo_version: a1344238901efc5ea199d8094cb16fca36ceb28b
    comfy_manager_version: "4.1"
    # Index serving the ROCm build of PyTorch.
    comfy_torch_index_url: https://download.pytorch.org/whl/rocm7.2
    # Validated kernel argument (disables compute wave save/restore on amdgpu).
    grub_cmdline_linux_default: "amdgpu.cwsr_enable=0"
    # Version-pinned AMD driver installer package.
    amd_driver_deb: amdgpu-install_7.2.1.70201-1_all.deb
    amd_driver_url: https://repo.radeon.com/amdgpu-install/7.2.1/ubuntu/noble/amdgpu-install_7.2.1.70201-1_all.deb
    # Base packages; headers/modules-extra track the currently running kernel.
    rigby_packages:
      - curl
      - git
      - rsync
      - software-properties-common
      - python-is-python3
      - python3.13
      - python3.13-venv
      - python3.13-dev
      - build-essential
      - linux-headers-{{ ansible_kernel }}
      - linux-modules-extra-{{ ansible_kernel }}
      - samba
      - just
      - python3.12
      - python3.12-venv
      - docker.io
    # vLLM service account, venv, and serving defaults.
    vllm_user: vllm
    vllm_home: /home/vllm
    vllm_venv: /home/vllm/vllm-venv
    vllm_models: /home/vllm/models
    vllm_port: 8000
    vllm_gpu_memory_utilization: "0.95"
    # Extra index serving ROCm wheels for vLLM.
    vllm_rocm_wheels_url: https://wheels.vllm.ai/rocm/0.19.0/rocm721
    # Models rendered into the vllm user's justfile as serve recipes.
    vllm_models_list:
      - name: Qwen2.5-Coder-14B
        recipe: coder
        dir: Qwen2.5-Coder-14B-Instruct
        max_model_len: 4096
      - name: Qwen2.5-7B-Instruct
        recipe: qwen7b
        dir: Qwen2.5-7B-Instruct
        max_model_len: 8192
        # Presence of this key enables tool-calling flags in the recipe.
        tool_call_parser: hermes
    # LibreChat runs via docker compose out of the operator's home.
    librechat_root: /home/bryan/LibreChat
    librechat_repo_url: https://github.com/danny-avila/LibreChat
  tasks:
    # --- Base OS, GPU driver, users, boot config ---
    - name: Ensure deadsnakes PPA is configured
      # Provides the python3.13 packages used for the ComfyUI venv.
      ansible.builtin.apt_repository:
        repo: ppa:deadsnakes/ppa
        state: present
        update_cache: true
    - name: Install required Ubuntu packages
      ansible.builtin.apt:
        name: "{{ rigby_packages }}"
        state: present
        update_cache: true
    - name: Ensure AMD installer package is present
      # Downloads the pinned amdgpu-install .deb from AMD's repository.
      ansible.builtin.get_url:
        url: "{{ amd_driver_url }}"
        dest: "/tmp/{{ amd_driver_deb }}"
        mode: "0644"
    - name: Install AMD installer package
      # Installing the .deb configures the AMD apt repositories used below.
      ansible.builtin.apt:
        deb: "/tmp/{{ amd_driver_deb }}"
        state: present
    - name: Install AMD GPU DKMS driver
      # DKMS builds against the kernel headers installed above.
      ansible.builtin.apt:
        name: amdgpu-dkms
        state: present
        update_cache: true
    - name: Install ROCm stack
      ansible.builtin.apt:
        name: rocm
        state: present
    - name: Ensure required groups exist
      # render/video grant GPU device access; comfy_group owns shared output.
      ansible.builtin.group:
        name: "{{ item }}"
        state: present
      loop:
        - render
        - video
        - "{{ comfy_group }}"
    - name: Ensure comfy user exists
      ansible.builtin.user:
        name: "{{ comfy_user }}"
        group: "{{ comfy_group }}"
        groups:
          - render
          - video
        append: true
        create_home: true
        shell: /bin/bash
    - name: Ensure bryan is in required groups
      # GPU access plus group-write access to the shared output tree.
      ansible.builtin.user:
        name: "{{ rigby_user }}"
        groups:
          - render
          - video
          - "{{ comfy_group }}"
        append: true
    - name: Ensure recovery SSH keys are present for bryan
      ansible.posix.authorized_key:
        user: "{{ rigby_user }}"
        state: present
        key: "{{ item }}"
      loop: "{{ rigby_recovery_ssh_keys }}"
    - name: Configure GRUB default kernel args
      # Replaces the whole GRUB_CMDLINE_LINUX_DEFAULT line with the
      # validated kernel arguments.
      ansible.builtin.lineinfile:
        path: /etc/default/grub
        regexp: '^GRUB_CMDLINE_LINUX_DEFAULT='
        line: 'GRUB_CMDLINE_LINUX_DEFAULT="{{ grub_cmdline_linux_default }}"'
    - name: Ensure GRUB menu is shown
      # Visible 5-second menu helps recover from a bad kernel/driver combo.
      ansible.builtin.lineinfile:
        path: /etc/default/grub
        regexp: '^{{ item.key }}='
        line: "{{ item.key }}={{ item.value }}"
      loop:
        - { key: GRUB_TIMEOUT_STYLE, value: "menu" }
        - { key: GRUB_TIMEOUT, value: "5" }
    - name: Regenerate grub config
      # update-grub has no change detection; always reported as changed.
      ansible.builtin.command: update-grub
      changed_when: true
    - name: Ensure Comfy directories exist
      # Mode 2775 on the output dir sets setgid so new entries inherit the
      # comfy group; piptmp is scratch space for large wheel unpacks.
      ansible.builtin.file:
        path: "{{ item.path }}"
        state: directory
        owner: "{{ comfy_user }}"
        group: "{{ comfy_group }}"
        mode: "{{ item.mode }}"
      loop:
        - { path: "{{ comfy_home }}", mode: "0775" }
        - { path: "{{ comfy_root }}", mode: "0775" }
        - { path: "{{ comfy_output_dir }}", mode: "2775" }
        - { path: "{{ comfy_home }}/.local/bin", mode: "0775" }
        - { path: "{{ comfy_home }}/piptmp", mode: "0775" }
- name: Ensure uv is installed for comfy
ansible.builtin.shell: |
set -euo pipefail
curl -LsSf https://astral.sh/uv/install.sh | sh
args:
creates: "{{ comfy_home }}/.local/bin/uv"
become_user: "{{ comfy_user }}"
    # --- ComfyUI application deploy ---
    - name: Ensure ComfyUI repo is present at pinned revision
      ansible.builtin.git:
        repo: "{{ comfy_repo_url }}"
        dest: "{{ comfy_root }}"
        version: "{{ comfy_repo_version }}"
        update: true
      become_user: "{{ comfy_user }}"
    - name: Ensure ComfyUI venv exists
      # `creates` keeps this idempotent across reruns.
      ansible.builtin.command:
        argv:
          - "{{ comfy_home }}/.local/bin/uv"
          - venv
          - --python
          - "{{ comfy_python_version }}"
          - "{{ comfy_venv }}"
      args:
        creates: "{{ comfy_venv }}/bin/python"
      become_user: "{{ comfy_user }}"
    - name: Install base Python packaging tools in Comfy venv
      # NOTE(review): no `creates`/`changed_when`, so this (and the install
      # tasks below) always report changed on reruns; harmless but noisy.
      ansible.builtin.command:
        argv:
          - "{{ comfy_home }}/.local/bin/uv"
          - pip
          - install
          - --python
          - "{{ comfy_venv }}/bin/python"
          - --upgrade
          - pip
          - setuptools
          - wheel
      become_user: "{{ comfy_user }}"
    - name: Install ROCm PyTorch in Comfy venv
      # TMPDIR points at comfy's own scratch dir so large wheel unpacks do
      # not exhaust /tmp.
      ansible.builtin.command:
        argv:
          - "{{ comfy_home }}/.local/bin/uv"
          - pip
          - install
          - --python
          - "{{ comfy_venv }}/bin/python"
          - --index-url
          - "{{ comfy_torch_index_url }}"
          - torch
          - torchvision
          - torchaudio
      environment:
        TMPDIR: "{{ comfy_home }}/piptmp"
      become_user: "{{ comfy_user }}"
    - name: Install ComfyUI requirements in Comfy venv
      ansible.builtin.command:
        argv:
          - "{{ comfy_home }}/.local/bin/uv"
          - pip
          - install
          - --python
          - "{{ comfy_venv }}/bin/python"
          - -r
          - "{{ comfy_root }}/requirements.txt"
      environment:
        TMPDIR: "{{ comfy_home }}/piptmp"
      become_user: "{{ comfy_user }}"
    - name: Install ComfyUI-Manager in Comfy venv
      # Pinned to comfy_manager_version for reproducible recovery.
      ansible.builtin.command:
        argv:
          - "{{ comfy_home }}/.local/bin/uv"
          - pip
          - install
          - --python
          - "{{ comfy_venv }}/bin/python"
          - "comfyui-manager=={{ comfy_manager_version }}"
      environment:
        TMPDIR: "{{ comfy_home }}/piptmp"
      become_user: "{{ comfy_user }}"
- name: Ensure output directories have group inheritance
ansible.builtin.shell: |
set -euo pipefail
find "{{ comfy_output_dir }}" -type d -exec chown {{ comfy_user }}:{{ comfy_group }} {} +
find "{{ comfy_output_dir }}" -type d -exec chmod 2775 {} +
changed_when: true
- name: Ensure output files are group writable
ansible.builtin.shell: |
set -euo pipefail
find "{{ comfy_output_dir }}" -type f -exec chown {{ comfy_user }}:{{ comfy_group }} {} +
find "{{ comfy_output_dir }}" -type f -exec chmod 0664 {} +
changed_when: true
    - name: Install ComfyUI systemd unit
      # Installed but left disabled further below; started on demand.
      ansible.builtin.template:
        src: ../templates/comfyui.service.j2
        dest: /etc/systemd/system/comfyui.service
        owner: root
        group: root
        mode: "0644"
- name: Ensure Samba include directory exists
ansible.builtin.file:
path: /etc/samba/smb.conf.d
state: directory
owner: root
group: root
mode: "0755"
- name: Install Samba share config for Comfy outputs
ansible.builtin.template:
src: ../templates/comfy-output.conf.j2
dest: /etc/samba/smb.conf.d/comfy-output.conf
owner: root
group: root
mode: "0644"
- name: Ensure Samba includes conf.d snippets
ansible.builtin.blockinfile:
path: /etc/samba/smb.conf
marker: "; {mark} ANSIBLE MANAGED COMFY OUTPUT INCLUDE"
block: |
include = /etc/samba/smb.conf.d/comfy-output.conf
- name: Reload systemd
ansible.builtin.systemd_service:
daemon_reload: true
- name: Ensure ComfyUI service is installed but disabled
ansible.builtin.systemd_service:
name: comfyui.service
enabled: false
- name: Ensure Samba service is enabled and running
ansible.builtin.systemd_service:
name: smbd.service
enabled: true
state: started
- name: Install netplan static IP config for rigby
ansible.builtin.template:
src: ../templates/99-rigby-static.yaml.j2
dest: /etc/netplan/99-rigby-static.yaml
owner: root
group: root
mode: "0644"
when: rigby_static_network_enabled | bool
- name: Apply static netplan configuration as final step
ansible.builtin.command: netplan apply
when: rigby_static_network_enabled | bool
changed_when: true
    # --- vLLM ---
    - name: Ensure vllm user exists
      # Dedicated service account; render/video give GPU device access.
      ansible.builtin.user:
        name: "{{ vllm_user }}"
        groups:
          - render
          - video
        append: true
        create_home: true
        shell: /bin/bash
    - name: Ensure vllm models directory exists
      # Model weights are NOT restored by this play (see README); this only
      # creates the empty directory layout.
      ansible.builtin.file:
        path: "{{ vllm_models }}"
        state: directory
        owner: "{{ vllm_user }}"
        group: "{{ vllm_user }}"
        mode: "0755"
- name: Ensure uv is installed for vllm user
ansible.builtin.shell: |
set -euo pipefail
curl -LsSf https://astral.sh/uv/install.sh | sh
args:
creates: "{{ vllm_home }}/.local/bin/uv"
become_user: "{{ vllm_user }}"
    - name: Ensure vllm venv exists
      # vLLM uses Python 3.12 (from rigby_packages), unlike the ComfyUI
      # venv which is on {{ comfy_python_version }}.
      ansible.builtin.command:
        argv:
          - "{{ vllm_home }}/.local/bin/uv"
          - venv
          - --python
          - "3.12"
          - "{{ vllm_venv }}"
      args:
        creates: "{{ vllm_venv }}/bin/python"
      become_user: "{{ vllm_user }}"
    - name: Install vLLM in venv
      # ROCm wheels come from the extra index; `creates` skips reinstall.
      ansible.builtin.command:
        argv:
          - "{{ vllm_home }}/.local/bin/uv"
          - pip
          - install
          - --python
          - "{{ vllm_venv }}/bin/python"
          - vllm
          - --extra-index-url
          - "{{ vllm_rocm_wheels_url }}"
      args:
        creates: "{{ vllm_venv }}/bin/vllm"
      become_user: "{{ vllm_user }}"
    - name: Install vllm justfile
      # Rendered serve/stop/status recipes for the models in
      # vllm_models_list.
      ansible.builtin.template:
        src: ../templates/vllm-justfile.j2
        dest: "{{ vllm_home }}/justfile"
        owner: "{{ vllm_user }}"
        group: "{{ vllm_user }}"
        mode: "0644"
    - name: Ensure vllm bashrc sources api key from file
      # NOTE(review): assumes {{ vllm_home }}/.api_key exists — this play
      # never creates it; the key must be provisioned manually.
      ansible.builtin.lineinfile:
        path: "{{ vllm_home }}/.bashrc"
        line: "export VLLM_API_KEY=$(cat {{ vllm_home }}/.api_key)"
        state: present
    # --- LibreChat ---
    - name: Ensure Docker service is enabled and running
      ansible.builtin.systemd_service:
        name: docker
        enabled: true
        state: started
    - name: Ensure bryan is in docker group
      # Lets the operator run docker compose without sudo (takes effect at
      # next login).
      ansible.builtin.user:
        name: "{{ rigby_user }}"
        groups:
          - docker
        append: true
    - name: Ensure LibreChat repo is present
      # update: false — an existing checkout is left untouched on reruns.
      ansible.builtin.git:
        repo: "{{ librechat_repo_url }}"
        dest: "{{ librechat_root }}"
        update: false
      become_user: "{{ rigby_user }}"
    - name: Install librechat.yaml config
      ansible.builtin.template:
        src: ../templates/librechat.yaml.j2
        dest: "{{ librechat_root }}/librechat.yaml"
        owner: "{{ rigby_user }}"
        group: "{{ rigby_user }}"
        mode: "0644"
    - name: Install librechat systemd unit
      ansible.builtin.template:
        src: ../templates/librechat.service.j2
        dest: /etc/systemd/system/librechat.service
        owner: root
        group: root
        mode: "0644"
    - name: Reload systemd and enable librechat service
      # Enabled for start-on-boot; no `state` here, so the service is not
      # explicitly started during this play.
      ansible.builtin.systemd_service:
        name: librechat.service
        daemon_reload: true
        enabled: true

View file

@ -0,0 +1,16 @@
# Static network configuration for rigby, rendered by the recovery play.
# Applied with `netplan apply`; the recovery SSH session may drop when the
# address changes.
network:
  version: 2
  renderer: networkd
  ethernets:
    {{ rigby_interface }}:
      dhcp4: false
      addresses:
        - {{ rigby_static_ip }}
      routes:
        - to: default
          via: {{ rigby_gateway }}
      nameservers:
        addresses:
{% for dns in rigby_dns %}
          - {{ dns }}
{% endfor %}

View file

@ -0,0 +1,9 @@
; Samba share exposing the ComfyUI output tree, rendered by the recovery
; play.
; NOTE(review): `guest ok = yes` allows unauthenticated access — acceptable
; only on a trusted LAN.
[comfy-output]
    path = {{ comfy_output_dir }}
    browseable = yes
    read only = no
    guest ok = yes
    ; Map all access to the comfy user/group so created files stay
    ; consistent with the setgid output directory.
    force user = {{ comfy_user }}
    force group = {{ comfy_group }}
    ; Group-writable files; setgid on new directories so the group is
    ; inherited.
    create mask = 0664
    directory mask = 2775

View file

@ -0,0 +1,22 @@
# ComfyUI service, rendered by the recovery play. Installed but left
# disabled; started on demand.
[Unit]
Description=ComfyUI
After=network-online.target
Wants=network-online.target

[Service]
Type=simple
User={{ comfy_user }}
Group={{ comfy_group }}
# Group-writable output files, matching the Samba share and the setgid
# output directory.
UMask=0002
WorkingDirectory={{ comfy_root }}
Environment=HOME={{ comfy_home }}
Environment=COMFYUI_PATH={{ comfy_root }}
Environment=PATH={{ comfy_home }}/.local/bin:{{ comfy_venv }}/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
ExecStart={{ comfy_venv }}/bin/python {{ comfy_root }}/main.py --highvram --enable-manager --listen 0.0.0.0 --port {{ comfy_port }} --disable-auto-launch
Restart=on-failure
RestartSec=5
# Basic hardening.
NoNewPrivileges=true
PrivateTmp=true

[Install]
WantedBy=multi-user.target

View file

@ -0,0 +1,18 @@
# LibreChat via docker compose, rendered by the recovery play.
[Unit]
Description=LibreChat
After=network-online.target docker.service
Wants=network-online.target
# Hard dependency: stops/fails together with the Docker daemon.
Requires=docker.service

[Service]
Type=simple
User={{ rigby_user }}
Group=docker
WorkingDirectory={{ librechat_root }}
# Foreground compose so systemd supervises the compose process itself.
ExecStart=/usr/bin/docker compose up
ExecStop=/usr/bin/docker compose down
Restart=on-failure
RestartSec=10

[Install]
WantedBy=multi-user.target

View file

@ -0,0 +1,43 @@
# LibreChat configuration, rendered by the recovery play.
version: 1.3.5
cache: true
interface:
  webSearch: false
  runCode: false
  # UI permissions for MCP servers (distinct from the server definitions
  # below).
  mcpServers:
    use: true
    create: true
    share: false
    public: false
# MCP server definitions available to chats.
mcpServers:
  searxng:
    # Web search; presumably `searxng` resolves on the compose network —
    # verify against the docker-compose setup.
    command: npx
    args:
      - -y
      - mcp-searxng
    env:
      SEARXNG_URL: http://searxng:8080
    timeout: 60000
  fetch:
    command: uvx
    args:
      - mcp-server-fetch
      - --ignore-robots-txt
    timeout: 60000
endpoints:
  custom:
    # OpenAI-compatible endpoint served by vLLM on the host.
    - name: "rigby-vllm"
      # Resolved from the container environment at runtime.
      apiKey: "${VLLM_API_KEY}"
      baseURL: "http://host.docker.internal:{{ vllm_port }}/v1"
      models:
        default: []
        # Discover available models from the endpoint.
        fetch: true
      titleConvo: true
      titleModel: "current_model"
      titleMessageRole: "user"
      summarize: false
      summaryModel: "current_model"
      modelDisplayLabel: "Rigby vLLM"

View file

@ -0,0 +1,58 @@
{% raw %}# List available recipes
[private]
default:
    @just --list

# Show currently running vLLM server
status:
    @pgrep -a -f "vllm serve" || echo "No vLLM server running"

# Tail the vLLM log
logs:
    @tail -f {% endraw %}{{ vllm_home }}/vllm.log{% raw %}

# Stop any running vLLM server and wait for VRAM to free
stop:
    #!/usr/bin/env bash
    set -euo pipefail
    if pgrep -f "vllm serve" > /dev/null; then
        echo "Stopping vLLM..."
        pkill -TERM -f "vllm serve" || true
        sleep 2
        pkill -KILL -f "vllm serve" 2>/dev/null || true
    fi
    echo "Waiting for VRAM to release..."
    for i in $(seq 1 30); do
        used=$(rocm-smi --showmeminfo vram 2>/dev/null | awk '/VRAM Total Used Memory/ {print $NF}')
        total=$(rocm-smi --showmeminfo vram 2>/dev/null | awk '/VRAM Total Memory \(B\)/ {print $NF}')
        if [ -n "$used" ] && [ -n "$total" ] && [ "$total" -gt 0 ]; then
            pct=$(( used * 100 / total ))
            echo "  VRAM: ${pct}%"
            if [ "$pct" -lt 10 ]; then
                echo "VRAM free."
                exit 0
            fi
        fi
        sleep 2
    done
    echo "Warning: VRAM did not fully release after 60s"
{% endraw %}
{% for model in vllm_models_list %}
# Serve {{ model.name }}
{{ model.recipe }}: stop
    #!/usr/bin/env bash
    # Read the API key directly from its file: sourcing ~/.bashrc is
    # unreliable in non-interactive shells because Ubuntu's default .bashrc
    # returns early there, before the appended export line runs.
    VLLM_API_KEY=$(cat {{ vllm_home }}/.api_key)
    nohup {{ vllm_home }}/vllm-venv/bin/vllm serve {{ vllm_models }}/{{ model.dir }} \
        --served-model-name {{ model.name }} \
        --host 0.0.0.0 \
        --port {{ vllm_port }} \
        --api-key "${VLLM_API_KEY}" \
        --dtype auto \
        --max-model-len {{ model.max_model_len }} \
        --gpu-memory-utilization {{ vllm_gpu_memory_utilization }}{% if model.tool_call_parser is defined %} \
        --enable-auto-tool-choice \
        --tool-call-parser {{ model.tool_call_parser }}{% endif %} \
        > {{ vllm_home }}/vllm.log 2>&1 &
    echo "Started {{ model.name }} (pid $!). Run 'just logs' to follow."
{% endfor %}