diff --git a/ROUND_7_PREP.md b/ROUND_7_PREP.md new file mode 100644 index 0000000..76b223b --- /dev/null +++ b/ROUND_7_PREP.md @@ -0,0 +1,266 @@ +# Round 7 Testing - Preparation and Safety Guidelines + +## Mission Context +Test AI-Homelab deployment scripts with focus on **safe cleanup and recovery** procedures. Round 6 revealed that aggressive cleanup operations caused system crashes requiring hard reboots and BIOS recovery. + +## Critical Safety Requirements - NEW for Round 7 + +### ⚠️ SYSTEM CRASH PREVENTION +**Issue from Round 6**: Aggressive cleanup operations caused system crashes requiring power cycles and BIOS recovery attempts. + +**Root Causes Identified**: +1. Removing directories while Docker containers were actively using them +2. Aggressive `rm -rf` operations on Docker volumes while containers running +3. No graceful shutdown sequence before cleanup +4. Docker volume operations causing filesystem corruption + +### Safe Testing Procedure + +#### Before Each Test Run +1. **Always use the new reset script** instead of manual cleanup +2. **Never** run cleanup commands while containers are running +3. **Always** stop containers gracefully before removing files +4. 
**Monitor** system resources during operations + +#### Using the Safe Reset Script +```bash +cd ~/AI-Homelab/scripts +sudo ./reset-test-environment.sh +``` + +This script: +- ✅ Stops all containers gracefully (proper shutdown) +- ✅ Waits for containers to fully stop +- ✅ Removes Docker volumes safely +- ✅ Cleans directories only after containers stopped +- ✅ Preserves system packages and settings +- ✅ Does NOT touch Docker installation +- ✅ Does NOT modify system files + +#### What NOT to Do (Dangerous Operations) +```bash +# ❌ NEVER do these while containers are running: +rm -rf /opt/stacks/core/traefik # Can corrupt active containers +rm -rf /var/lib/docker/volumes/* # Filesystem corruption risk +docker volume rm $(docker volume ls -q) # Removes volumes containers need +find /var/lib/docker -exec rm -rf {} + # EXTREMELY DANGEROUS + +# ❌ NEVER force remove running containers: +docker rm -f $(docker ps -aq) # Can cause state corruption + +# ❌ NEVER use pkill on Docker processes: +pkill -9 docker # Can corrupt Docker daemon state +``` + +#### Safe Cleanup Sequence (Manual) +If you need to clean up manually: +```bash +# 1. Stop services gracefully +cd /opt/stacks/core +docker compose down # Waits for clean shutdown + +# 2. Wait for full stop +sleep 5 + +# 3. Then and only then remove files +rm -rf /opt/stacks/core/traefik +rm -rf /opt/stacks/core/authelia + +# 4. Remove volumes after containers stopped +docker volume rm core_authelia-data +``` + +## Round 7 Objectives + +### Primary Goals +1. ✅ Verify safe reset script works without system crashes +2. ✅ Test full deployment after reset (round-trip testing) +3. ✅ Validate no file system corruption occurs +4. ✅ Ensure containers start cleanly after reset +5. 
✅ Document any remaining edge cases + +### Testing Checklist + +#### Pre-Testing Setup +- [ ] System is stable and responsive +- [ ] All previous containers stopped cleanly +- [ ] Disk space sufficient (5GB+ free) +- [ ] No filesystem errors: `dmesg | grep -i error` +- [ ] Docker daemon healthy: `systemctl status docker` + +#### Round 7 Test Sequence +1. **Clean slate** using reset script + ```bash + sudo ./scripts/reset-test-environment.sh + ``` + - [ ] Script completes without errors + - [ ] System remains responsive + - [ ] No kernel panics or crashes + - [ ] All volumes removed cleanly + +2. **Fresh deployment** with improved scripts + ```bash + sudo ./scripts/setup-homelab.sh + ``` + - [ ] Completes successfully + - [ ] No permission errors + - [ ] Password hash generated correctly + - [ ] Credentials saved properly + +3. **Deploy infrastructure** + ```bash + sudo ./scripts/deploy-homelab.sh + ``` + - [ ] Containers start cleanly + - [ ] No file conflicts + - [ ] Authelia initializes properly + - [ ] Credentials work immediately + +4. **Verify services** + - [ ] Traefik accessible and routing + - [ ] Authelia login works + - [ ] Dockge UI accessible + - [ ] SSL certificates generating + +5. 
**Test reset again** (idempotency) + ```bash + sudo ./scripts/reset-test-environment.sh + ``` + - [ ] Stops everything gracefully + - [ ] No orphaned containers + - [ ] No volume leaks + - [ ] System stable after reset + +## Changes Made for Round 7 + +### New Files +- **`scripts/reset-test-environment.sh`** - Safe cleanup script with proper shutdown sequence + +### Modified Files +- **`scripts/deploy-homelab.sh`**: + - Added graceful container stop before config file operations + - Removed automatic database cleanup (now use reset script instead) + - Added safety checks before rm operations + - Better warnings about existing databases + +### Removed Dangerous Operations +```bash +# REMOVED from deploy-homelab.sh: +docker compose up -d authelia 2>&1 | grep -q "encryption key" && { + docker compose down authelia + sudo rm -rf /var/lib/docker/volumes/core_authelia-data/_data/* +} +# This was causing crashes - containers couldn't handle abrupt volume removal + +# REMOVED blind directory removal: +rm -rf /opt/stacks/core/traefik /opt/stacks/core/authelia +# Now checks if containers are running first +``` + +## System Health Monitoring + +### Before Each Test Run +```bash +# Check system health +free -h # Memory available +df -h # Disk space +dmesg | tail -20 # Recent kernel messages +systemctl status docker # Docker daemon health +docker ps # Running containers +``` + +### During Testing +- Monitor system logs: `journalctl -f` +- Watch Docker logs: `docker compose logs -f` +- Check resource usage: `htop` or `top` + +### After Issues +```bash +# If system becomes unresponsive: +# 1. DO NOT hard power off immediately +# 2. Try to SSH in from another machine +# 3. Gracefully stop Docker: systemctl stop docker +# 4. Wait 30 seconds for disk writes to complete +# 5. 
Then reboot: systemctl reboot + +# Check for filesystem corruption after boot: +sudo dmesg | grep -i error +sudo journalctl -xb | grep -i error +``` + +## Recovery Procedures + +### If Deploy Script Hangs +```bash +# In another terminal: +cd /opt/stacks/core +docker compose ps # See what's running +docker compose logs authelia # Check for errors +docker compose down # Stop gracefully +# Then re-run deploy +``` + +### If Authelia Won't Start (Encryption Key Error) +```bash +# Use the reset script: +sudo ./scripts/reset-test-environment.sh +# Then start fresh deployment +``` + +### If System Crashed During Testing +```bash +# After reboot: +# 1. Check Docker state +systemctl status docker +docker ps -a # Look for crashed containers + +# 2. Clean up properly +cd /opt/stacks/core +docker compose down --remove-orphans + +# 3. Remove corrupted volumes +docker volume prune -f + +# 4. Start fresh with reset script +sudo ./scripts/reset-test-environment.sh +``` + +## Success Criteria for Round 7 + +### Must Have +- ✅ Reset script completes without system crashes +- ✅ Can deploy and reset multiple times safely +- ✅ No filesystem corruption after any operation +- ✅ System remains responsive throughout testing +- ✅ All containers stop gracefully + +### Should Have +- ✅ Clear warnings before destructive operations +- ✅ Confirmation prompts for cleanup +- ✅ Progress indicators during long operations +- ✅ Health checks before and after operations + +### Nice to Have +- ⭐ Automatic backup before reset +- ⭐ Rollback capability +- ⭐ System health validation +- ⭐ Detailed logging of all operations + +## Emergency Contacts / References + +- Docker best practices: https://docs.docker.com/config/daemon/ +- Linux filesystem safety: `man sync`, `man fsync` +- systemd service management: `man systemctl` + +## Post-Round 7 Review + +### Document These +- [ ] Any new crash scenarios discovered +- [ ] System resource usage patterns +- [ ] Time required for clean operations +- [ ] Any 
remaining unsafe operations +- [ ] User experience improvements needed + +--- + +**Remember**: System stability is more important than testing speed. Always wait for clean shutdowns and never force operations. diff --git a/scripts/deploy-homelab.sh b/scripts/deploy-homelab.sh index 8e67f27..c499916 100755 --- a/scripts/deploy-homelab.sh +++ b/scripts/deploy-homelab.sh @@ -130,6 +130,13 @@ echo "" # Copy core stack files log_info "Preparing core stack configuration files..." +# Safety: Stop existing core stack if running (prevents file conflicts) +if [ -f "/opt/stacks/core/docker-compose.yml" ]; then + log_info "Stopping existing core stack for safe reconfiguration..." + cd /opt/stacks/core && docker compose down 2>/dev/null || true + sleep 2 +fi + # Clean up any incorrect directory structure from previous runs if [ -d "/opt/stacks/core/traefik/acme.json" ]; then log_warning "Removing incorrectly created acme.json directory" @@ -143,8 +150,14 @@ fi # Copy compose file cp "$REPO_DIR/docker-compose/core.yml" /opt/stacks/core/docker-compose.yml -# Remove existing config directories and copy fresh ones -rm -rf /opt/stacks/core/traefik /opt/stacks/core/authelia +# Safely remove and replace config directories +if [ -d "/opt/stacks/core/traefik" ]; then + rm -rf /opt/stacks/core/traefik +fi +if [ -d "/opt/stacks/core/authelia" ]; then + rm -rf /opt/stacks/core/authelia +fi + cp -r "$REPO_DIR/config-templates/traefik" /opt/stacks/core/ cp -r "$REPO_DIR/config-templates/authelia" /opt/stacks/core/ @@ -222,14 +235,12 @@ fi # Clean up old Authelia database if encryption key changed # This prevents "encryption key does not appear to be valid" errors if [ -d "/var/lib/docker/volumes/core_authelia-data/_data" ]; then - log_info "Checking for Authelia database encryption key issues..." 
-    # Test if Authelia can start, if not, clean the database
-    docker compose up -d authelia 2>&1 | grep -q "encryption key" && {
-        log_warning "Encryption key mismatch detected, cleaning Authelia database..."
-        docker compose down authelia
-        sudo rm -rf /var/lib/docker/volumes/core_authelia-data/_data/*
-        log_success "Authelia database cleaned"
-    } || log_info "Database check passed"
+    log_info "Checking for existing Authelia database..."
+    # Check if database exists and might have encryption key mismatch
+    if [ -f "/var/lib/docker/volumes/core_authelia-data/_data/db.sqlite3" ]; then
+        log_warning "Existing Authelia database found from previous deployment"
+        log_info "If deployment fails with encryption key errors, run: sudo ./scripts/reset-test-environment.sh"
+    fi
 fi
 
 # Deploy core stack
diff --git a/scripts/reset-test-environment.sh b/scripts/reset-test-environment.sh
new file mode 100755
index 0000000..096fdca
--- /dev/null
+++ b/scripts/reset-test-environment.sh
@@ -0,0 +1,201 @@
+#!/bin/bash
+# AI-Homelab Test Environment Reset Script
+# Safe cleanup for testing between rounds
+# Run as: sudo ./reset-test-environment.sh
+#
+# Sequence: stop the dashboards/infrastructure/core compose stacks, verify
+# that none of their containers is still running, then remove the homelab
+# volumes, stack directories, temp files and networks, and finally prune
+# unused Docker resources. Nothing is deleted while a stack is still up.
+
+set -e # Exit on error
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+# Stacks under /opt/stacks, in the order they are shut down. Each name is
+# also the compose project name (compose defaults it to the directory name).
+STACKS=(dashboards infrastructure core)
+
+# Log functions
+log_info() {
+    echo -e "${BLUE}[INFO]${NC} $1"
+}
+
+log_success() {
+    echo -e "${GREEN}[SUCCESS]${NC} $1"
+}
+
+log_warning() {
+    echo -e "${YELLOW}[WARNING]${NC} $1"
+}
+
+log_error() {
+    echo -e "${RED}[ERROR]${NC} $1"
+}
+
+# Stop one compose stack and report the real outcome.
+# Arguments: $1 - stack name (a directory under /opt/stacks)
+stop_stack() {
+    local stack="$1"
+    local dir="/opt/stacks/$stack"
+
+    [ -d "$dir" ] || return 0
+    # The subshell keeps the cd local, so the script never ends up sitting
+    # inside a directory that a later step deletes.
+    if (cd "$dir" && docker compose down 2>/dev/null); then
+        log_success "${stack^} stack stopped"
+    else
+        log_warning "${stack^} stack: 'docker compose down' failed (no compose file, or Docker error)"
+    fi
+}
+
+# Check if running as root
+if [ "$EUID" -ne 0 ]; then
+    log_error "Please run as root (use: sudo ./reset-test-environment.sh)"
+    exit 1
+fi
+
+# Get the actual user who invoked sudo
+ACTUAL_USER="${SUDO_USER:-$USER}"
+if [ "$ACTUAL_USER" = "root" ]; then
+    log_error "Please run this script with sudo, not as root user"
+    exit 1
+fi
+
+echo "=========================================="
+log_warning "AI-Homelab Test Environment Reset"
+echo "=========================================="
+echo ""
+log_warning "This will safely remove all deployed services and data"
+log_warning "This is intended for testing - DO NOT use in production!"
+echo ""
+read -r -p "Are you sure you want to reset? (type 'yes' to continue): " CONFIRM
+
+if [ "$CONFIRM" != "yes" ]; then
+    log_info "Reset cancelled"
+    exit 0
+fi
+
+echo ""
+log_info "Starting safe cleanup process..."
+echo ""
+
+# Step 1: Stop all Docker Compose stacks gracefully
+log_info "Step 1/6: Stopping all Docker Compose stacks..."
+
+for stack in "${STACKS[@]}"; do
+    stop_stack "$stack"
+done
+
+# Wait for containers to fully stop
+sleep 3
+
+# Refuse to delete anything underneath a live container: make sure nothing
+# from these compose projects is still running before going any further.
+for stack in "${STACKS[@]}"; do
+    running=$(docker ps -q --filter "label=com.docker.compose.project=$stack" 2>/dev/null || true)
+    if [ -n "$running" ]; then
+        log_error "Containers from the $stack stack are still running - aborting before any data is removed"
+        log_info "Inspect with: docker ps --filter label=com.docker.compose.project=$stack"
+        exit 1
+    fi
+done
+log_success "All stacks stopped gracefully"
+echo ""
+
+# Step 2: Remove Docker volumes (data will be lost)
+log_info "Step 2/6: Removing Docker volumes..."
+
+# List volumes to remove
+VOLUMES=$(docker volume ls -q | grep -E "^(core_|infrastructure_|dashboards_)" 2>/dev/null || true)
+
+if [ -n "$VOLUMES" ]; then
+    while IFS= read -r vol; do
+        if docker volume rm "$vol" 2>/dev/null; then
+            log_success "Removed volume: $vol"
+        else
+            log_warning "Could not remove volume: $vol"
+        fi
+    done <<< "$VOLUMES"
+else
+    log_info "No homelab volumes found"
+fi
+
+echo ""
+
+# Step 3: Remove stack directories (configs will be regenerated)
+log_info "Step 3/6: Removing stack configuration directories..."
+
+# The script may have been started from inside a stack directory; step out
+# so the working directory is not deleted from under the remaining steps.
+cd /
+
+if [ -d "/opt/stacks" ]; then
+    for stack in "${STACKS[@]}"; do
+        rm -rf "/opt/stacks/$stack"
+    done
+    log_success "Stack directories removed"
+else
+    log_info "No stack directories found"
+fi
+
+if [ -d "/opt/dockge/data" ]; then
+    # find, unlike the glob data/*, also clears hidden files and directories
+    find /opt/dockge/data -mindepth 1 -delete
+    log_success "Dockge data cleared"
+fi
+
+echo ""
+
+# Step 4: Clean up temporary files
+log_info "Step 4/6: Cleaning temporary files..."
+
+rm -f /tmp/authelia_admin_credentials.tmp
+rm -f /tmp/nvidia*.log
+log_success "Temporary files cleaned"
+echo ""
+
+# Step 5: Remove Docker networks
+log_info "Step 5/6: Removing Docker networks..."
+
+# Tell "does not exist" apart from "exists but could not be removed".
+for net in homelab-network traefik-network dockerproxy-network media-network; do
+    if ! docker network inspect "$net" >/dev/null 2>&1; then
+        log_info "$net not found"
+    elif docker network rm "$net" 2>/dev/null; then
+        log_success "Removed $net"
+    else
+        log_warning "Could not remove $net (still in use?)"
+    fi
+done
+
+echo ""
+
+# Step 6: Prune unused Docker resources
+log_info "Step 6/6: Pruning unused Docker resources..."
+
+# NOTE: this prune is host-wide, not limited to the homelab stacks.
+docker system prune -f --volumes 2>&1 | grep -E "(Deleted|Total reclaimed)" || true
+log_success "Docker cleanup complete"
+echo ""
+
+# Final summary
+echo "=========================================="
+log_success "Test environment reset complete!"
+echo "=========================================="
+echo ""
+log_info "System is ready for next round of testing"
+log_info ""
+log_info "Next steps:"
+echo " 1. Ensure .env file is properly configured"
+echo " 2. Run: sudo ./setup-homelab.sh"
+echo " 3. Run: sudo ./deploy-homelab.sh"
+echo ""
+log_info "Note: Docker and system packages are NOT removed"
+log_info "User groups and firewall settings are preserved"
+echo ""