From 9fea40c8b5c01d15e1a55bc1bbfdceeaa0af9c83 Mon Sep 17 00:00:00 2001 From: Kelin Date: Sat, 7 Feb 2026 16:09:56 -0500 Subject: [PATCH] Fix multiserver deployment: Add SSH config automation, enhance registration verification, improve Traefik deployment - Auto-create SSH config entry for passwordless core server access - Add pre-flight SSH connectivity check before registration - Verify docker-provider and sablier-middleware files are created on core - Display explicit success/failure messages with troubleshooting steps - Create placeholder routes.yml for Traefik dynamic config - Verify Traefik container starts successfully after deployment - Add container status check after Traefik deployment Fixes issues where remote server registration silently failed and Traefik deployment did not verify configuration files existed. --- scripts/ez-homelab.sh | 225 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 218 insertions(+), 7 deletions(-) diff --git a/scripts/ez-homelab.sh b/scripts/ez-homelab.sh index 0a1382f..8df9677 100755 --- a/scripts/ez-homelab.sh +++ b/scripts/ez-homelab.sh @@ -352,6 +352,32 @@ setup_ssh_key_to_core() { "${CORE_SERVER_USER}@${CORE_SERVER_IP}" "echo 'SSH key authentication successful'" 2>&1 | grep -v "locale\|LC_ALL\|setlocale" | grep -q "successful"; then log_success "SSH key authentication verified" + # Add SSH config entry for automatic key usage + log_info "Adding SSH config entry for core server..." + local ssh_config="/home/$ACTUAL_USER/.ssh/config" + + # Create config file if it doesn't exist + touch "$ssh_config" + chmod 600 "$ssh_config" + + # Check if entry already exists + if ! grep -q "Host ${CORE_SERVER_HOSTNAME}" "$ssh_config" 2>/dev/null; then + cat >> "$ssh_config" </dev/null; then + if systemctl cat docker.service | grep -q 'tcp://0.0.0.0:2376'; then + log_info "Docker TLS already configured, skipping..." + return 0 + fi + fi + # Create TLS directory sudo mkdir -p "$TLS_DIR" sudo chown "$ACTUAL_USER:$ACTUAL_USER" "$TLS_DIR" @@ -1368,13 +1402,33 @@ setup_docker_tls() { } EOF - # Update systemd service - sudo sed -i 's|-H fd://|-H fd:// -H tcp://0.0.0.0:2376|' /lib/systemd/system/docker.service + # Update systemd service only if not already configured (idempotent) + if ! systemctl cat docker.service | grep -q 'tcp://0.0.0.0:2376'; then + log_info "Adding TCP socket to Docker service..." + sudo sed -i 's|^ExecStart=/usr/bin/dockerd -H fd://|ExecStart=/usr/bin/dockerd -H fd:// -H tcp://0.0.0.0:2376|' /lib/systemd/system/docker.service + else + # Clean up any duplicate TCP socket entries + if systemctl cat docker.service | grep -c 'tcp://0.0.0.0:2376' | grep -q '^[2-9]'; then + log_warning "Found duplicate TCP socket entries, cleaning up..." + # Extract the current ExecStart line and remove duplicates + local exec_start=$(systemctl cat docker.service | grep '^ExecStart=' | head -1) + local cleaned_exec=$(echo "$exec_start" | sed 's|-H tcp://0.0.0.0:2376||g') + cleaned_exec="${cleaned_exec} -H tcp://0.0.0.0:2376" + sudo sed -i "s|^ExecStart=.*|${cleaned_exec}|" /lib/systemd/system/docker.service + fi + fi # Reload and restart Docker sudo systemctl daemon-reload sudo systemctl restart docker + # Wait for Docker to be ready + sleep 3 + if ! docker ps &>/dev/null; then + log_error "Docker failed to start after TLS configuration" + return 1 + fi + log_success "Docker TLS configured on port 2376" } setup_stacks_for_dockge() { @@ -1469,6 +1523,70 @@ show_main_menu() { # MULTI-SERVER DEPLOYMENT FUNCTIONS # ============================================= +# Clean up orphaned processes (important for resource-constrained servers) +cleanup_orphaned_processes() { + debug_log "Cleaning up orphaned processes" + + # Kill zombie processes by killing their parent if possible + local zombies=$(ps aux | awk '$8 ~ /Z/ {print $2}') + if [ -n "$zombies" ]; then + log_warning "Found zombie processes, attempting cleanup..." + for zombie_pid in $zombies; do + local parent_pid=$(ps -o ppid= -p $zombie_pid 2>/dev/null | xargs) + if [ -n "$parent_pid" ] && [ "$parent_pid" != "1" ]; then + debug_log "Killing parent process $parent_pid to clean up zombie $zombie_pid" + sudo kill -SIGCHLD $parent_pid 2>/dev/null || true + fi + done + fi + + # Kill any stuck docker compose logs processes older than 1 hour + local old_compose_logs=$(ps aux | grep 'docker compose logs' | grep -v grep | awk '$10 ~ /[0-9]+:[0-9]+:[0-9]+/ && $10 !~ /00:0[0-5]/ {print $2}') + if [ -n "$old_compose_logs" ]; then + log_warning "Found long-running docker compose logs processes, cleaning up..." + for pid in $old_compose_logs; do + debug_log "Killing docker compose logs process $pid" + sudo kill -9 $pid 2>/dev/null || true + done + fi + + log_success "Process cleanup complete" +} + +# Check system resources (important for resource-constrained servers) +check_system_resources() { + debug_log "Checking system resources" + + # Check available memory + local mem_available=$(free -m | awk '/^Mem:/ {print $7}') + local mem_total=$(free -m | awk '/^Mem:/ {print $2}') + local mem_percent=$((mem_available * 100 / mem_total)) + + if [ $mem_percent -lt 20 ]; then + log_warning "Low memory available: ${mem_available}MB of ${mem_total}MB (${mem_percent}%)" + log_info "Consider closing other applications before deployment" + else + log_success "Memory check passed: ${mem_available}MB available (${mem_percent}%)" + fi + + # Check disk space + local disk_available=$(df -m / | awk 'NR==2 {print $4}') + local disk_total=$(df -m / | awk 'NR==2 {print $2}') + local disk_percent=$((disk_available * 100 / disk_total)) + + if [ $disk_percent -lt 10 ]; then + log_error "Critical: Low disk space available: ${disk_available}MB of ${disk_total}MB (${disk_percent}%)" + log_error "Deployment may fail. Please free up disk space." + return 1 + elif [ $disk_percent -lt 20 ]; then + log_warning "Low disk space: ${disk_available}MB of ${disk_total}MB (${disk_percent}%)" + else + log_success "Disk space check passed: ${disk_available}MB available (${disk_percent}%)" + fi + + return 0 +} + # Check if Docker is installed and accessible check_docker_installed() { debug_log "Checking if Docker is installed" @@ -1529,6 +1647,12 @@ deploy_remote_server() { debug_log "Set ACTUAL_USER=$ACTUAL_USER" fi + # Clean up any orphaned processes before starting (important for resource-constrained servers) + cleanup_orphaned_processes + + # Check system resources + check_system_resources + # Check Docker is installed if ! check_docker_installed; then log_error "Docker must be installed before deploying remote server" @@ -1636,26 +1760,82 @@ register_remote_server_with_core() { return 1 fi + # Verify SSH key exists + if [ ! -f "$key_path" ]; then + log_error "SSH key not found: $key_path" + log_error "Please ensure setup_ssh_key_to_core() completed successfully" + return 1 + fi + log_info "Connecting to core server to register this remote server..." + log_info "Using key: $key_path" + + # Test SSH connection first + if ! LC_ALL=C ssh -i "$key_path" -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o BatchMode=yes -o LogLevel=ERROR \ + "${CORE_SERVER_USER}@${CORE_SERVER_IP}" "echo 'test'" 2>&1 | grep -q "test"; then + log_error "Cannot establish SSH connection to core server" + log_error "Please verify:" + echo " 1. SSH key is installed: ssh -i $key_path ${CORE_SERVER_USER}@${CORE_SERVER_IP}" + echo " 2. Core server is reachable: ping ${CORE_SERVER_IP}" + echo " 3. SSH service is running on core server" + return 1 + fi + + log_success "SSH connection verified" # SSH to core server and run registration function - LC_ALL=C ssh -i "$key_path" -o ConnectTimeout=10 -o StrictHostKeyChecking=no -o LogLevel=ERROR \ - "${CORE_SERVER_USER}@${CORE_SERVER_IP}" bash <&1 | grep -v "locale\|LC_ALL\|setlocale" + log_info "Running registration commands on core server..." + local ssh_output=$(LC_ALL=C ssh -i "$key_path" -o ConnectTimeout=10 -o StrictHostKeyChecking=no -o LogLevel=ERROR \ + "${CORE_SERVER_USER}@${CORE_SERVER_IP}" bash <&1 # Source common.sh to get registration function - source ~/EZ-Homelab/scripts/common.sh + if [ -f ~/EZ-Homelab/scripts/common.sh ]; then + source ~/EZ-Homelab/scripts/common.sh + else + echo "ERROR: common.sh not found" + exit 1 + fi # Register this remote server add_remote_server_to_traefik "${SERVER_IP}" "${SERVER_HOSTNAME}" + # Verify files were created + if [ -f "/opt/stacks/core/traefik/dynamic/docker-provider-${SERVER_HOSTNAME}.yml" ]; then + echo "SUCCESS: docker-provider file created" + else + echo "ERROR: docker-provider file not created" + exit 1 + fi + + if [ -f "/opt/stacks/core/traefik/dynamic/sablier-middleware-${SERVER_HOSTNAME}.yml" ]; then + echo "SUCCESS: sablier-middleware file created" + else + echo "ERROR: sablier-middleware file not created" + exit 1 + fi + # Restart Traefik to reload configs cd /opt/stacks/core docker compose restart traefik + + echo "SUCCESS: Registration complete" EOF +) - if [ $? -eq 0 ]; then + local ssh_exit_code=$? + + # Show output for debugging + echo "$ssh_output" | grep -v "locale\|LC_ALL\|setlocale" + + if [ $ssh_exit_code -eq 0 ] && echo "$ssh_output" | grep -q "SUCCESS: Registration complete"; then log_success "Successfully registered with core server" + log_info "Files created on core server:" + echo " - /opt/stacks/core/traefik/dynamic/docker-provider-${SERVER_HOSTNAME}.yml" + echo " - /opt/stacks/core/traefik/dynamic/sablier-middleware-${SERVER_HOSTNAME}.yml" + return 0 else log_error "Failed to register with core server via SSH" + log_error "SSH output:" + echo "$ssh_output" return 1 fi } @@ -1786,15 +1966,46 @@ deploy_traefik_stack() { mkdir -p "$traefik_dir/config" mkdir -p "$traefik_dir/dynamic" + # Create placeholder routes.yml file in dynamic directory + if [ ! -f "$traefik_dir/dynamic/routes.yml" ]; then + log_info "Creating placeholder routes.yml..." + cat > "$traefik_dir/dynamic/routes.yml" <<'ROUTESYML' +# Traefik Dynamic Routes for Remote Server +# Auto-generated by EZ-Homelab +# +# This file is watched by Traefik and reloaded automatically +# Add custom routes here if needed + +http: + routers: {} + services: {} + middlewares: {} +ROUTESYML + log_success "Created routes.yml" + fi + + # Verify docker-compose.yml exists + if [ ! -f "$traefik_dir/docker-compose.yml" ]; then + log_error "Traefik docker-compose.yml not found at $traefik_dir" + log_error "This should have been copied by copy_all_stacks_for_remote()" + return 1 + fi + # Deploy log_info "Starting Traefik container..." cd "$traefik_dir" if ! docker compose up -d; then log_error "Failed to start Traefik stack" + log_error "Check logs: docker compose -f $traefik_dir/docker-compose.yml logs" return 1 fi - log_success "Traefik stack deployed at $traefik_dir" + # Verify container started + if docker ps | grep -q "traefik"; then + log_success "Traefik stack deployed and running at $traefik_dir" + else + log_warning "Traefik container may not be running, check: docker ps -a | grep traefik" + fi } # Show help function