Fix multiserver deployment: Add SSH config automation, enhance registration verification, improve Traefik deployment

- Auto-create SSH config entry for passwordless core server access
- Add pre-flight SSH connectivity check before registration
- Verify docker-provider and sablier-middleware files are created on core
- Display explicit success/failure messages with troubleshooting steps
- Create placeholder routes.yml for Traefik dynamic config
- Verify Traefik container starts successfully after deployment
- Add container status check after Traefik deployment

Fixes issues where remote server registration silently failed and
Traefik deployment did not verify configuration files existed.
This commit is contained in:
Kelin
2026-02-07 16:09:56 -05:00
parent 44b529a7cb
commit 9fea40c8b5

View File

@@ -352,6 +352,32 @@ setup_ssh_key_to_core() {
"${CORE_SERVER_USER}@${CORE_SERVER_IP}" "echo 'SSH key authentication successful'" 2>&1 | grep -v "locale\|LC_ALL\|setlocale" | grep -q "successful"; then "${CORE_SERVER_USER}@${CORE_SERVER_IP}" "echo 'SSH key authentication successful'" 2>&1 | grep -v "locale\|LC_ALL\|setlocale" | grep -q "successful"; then
log_success "SSH key authentication verified" log_success "SSH key authentication verified"
# Add SSH config entry for automatic key usage
log_info "Adding SSH config entry for core server..."
local ssh_config="/home/$ACTUAL_USER/.ssh/config"
# Create config file if it doesn't exist
touch "$ssh_config"
chmod 600 "$ssh_config"
# Check if entry already exists
if ! grep -q "Host ${CORE_SERVER_HOSTNAME}" "$ssh_config" 2>/dev/null; then
cat >> "$ssh_config" <<SSHCONFIG
# Auto-generated by EZ-Homelab for remote server ${SERVER_HOSTNAME}
Host ${CORE_SERVER_HOSTNAME}
HostName ${CORE_SERVER_IP}
User ${CORE_SERVER_USER}
IdentityFile ${key_path}
StrictHostKeyChecking no
UserKnownHostsFile /dev/null
LogLevel ERROR
SSHCONFIG
log_success "SSH config entry added for ${CORE_SERVER_HOSTNAME}"
else
log_info "SSH config entry already exists"
fi
# Export key path for use by other functions # Export key path for use by other functions
export SSH_KEY_PATH="$key_path" export SSH_KEY_PATH="$key_path"
return 0 return 0
@@ -1330,6 +1356,14 @@ perform_deployment() {
setup_docker_tls() { setup_docker_tls() {
local TLS_DIR="/home/$ACTUAL_USER/EZ-Homelab/docker-tls" local TLS_DIR="/home/$ACTUAL_USER/EZ-Homelab/docker-tls"
# Check if TLS is already configured
if [ -f "/etc/docker/daemon.json" ] && grep -q '"tls": true' /etc/docker/daemon.json 2>/dev/null; then
if systemctl cat docker.service | grep -q 'tcp://0.0.0.0:2376'; then
log_info "Docker TLS already configured, skipping..."
return 0
fi
fi
# Create TLS directory # Create TLS directory
sudo mkdir -p "$TLS_DIR" sudo mkdir -p "$TLS_DIR"
sudo chown "$ACTUAL_USER:$ACTUAL_USER" "$TLS_DIR" sudo chown "$ACTUAL_USER:$ACTUAL_USER" "$TLS_DIR"
@@ -1368,13 +1402,33 @@ setup_docker_tls() {
} }
EOF EOF
# Update systemd service # Update systemd service only if not already configured (idempotent)
sudo sed -i 's|-H fd://|-H fd:// -H tcp://0.0.0.0:2376|' /lib/systemd/system/docker.service if ! systemctl cat docker.service | grep -q 'tcp://0.0.0.0:2376'; then
log_info "Adding TCP socket to Docker service..."
sudo sed -i 's|^ExecStart=/usr/bin/dockerd -H fd://|ExecStart=/usr/bin/dockerd -H fd:// -H tcp://0.0.0.0:2376|' /lib/systemd/system/docker.service
else
# Clean up any duplicate TCP socket entries
if systemctl cat docker.service | grep -c 'tcp://0.0.0.0:2376' | grep -q '^[2-9]'; then
log_warning "Found duplicate TCP socket entries, cleaning up..."
# Extract the current ExecStart line and remove duplicates
local exec_start=$(systemctl cat docker.service | grep '^ExecStart=' | head -1)
local cleaned_exec=$(echo "$exec_start" | sed 's|-H tcp://0.0.0.0:2376||g')
cleaned_exec="${cleaned_exec} -H tcp://0.0.0.0:2376"
sudo sed -i "s|^ExecStart=.*|${cleaned_exec}|" /lib/systemd/system/docker.service
fi
fi
# Reload and restart Docker # Reload and restart Docker
sudo systemctl daemon-reload sudo systemctl daemon-reload
sudo systemctl restart docker sudo systemctl restart docker
# Wait for Docker to be ready
sleep 3
if ! docker ps &>/dev/null; then
log_error "Docker failed to start after TLS configuration"
return 1
fi
log_success "Docker TLS configured on port 2376" log_success "Docker TLS configured on port 2376"
} }
setup_stacks_for_dockge() { setup_stacks_for_dockge() {
@@ -1469,6 +1523,70 @@ show_main_menu() {
# MULTI-SERVER DEPLOYMENT FUNCTIONS # MULTI-SERVER DEPLOYMENT FUNCTIONS
# ============================================= # =============================================
# Clean up orphaned processes (important for resource-constrained servers)
#
# Best-effort housekeeping: every signal sent here is allowed to fail, so a
# process that has already exited never aborts the caller.
#   1. For each zombie (STAT column of `ps aux` contains Z), send SIGCHLD to
#      its parent so the parent gets a chance to reap it. Parents with PID 1
#      are left alone.
#   2. Stop `docker compose logs` processes whose elapsed (wall-clock) run
#      time exceeds one hour: SIGTERM first, SIGKILL only for survivors.
# Arguments: none
# Outputs:   progress via debug_log / log_warning / log_success
# Returns:   status of the final log_success call
cleanup_orphaned_processes() {
  debug_log "Cleaning up orphaned processes"

  local max_age_seconds=3600
  local zombie_pids zombie_pid parent_pid stale_pids pid

  # A zombie cannot be signalled itself; only its parent can reap it.
  zombie_pids=$(ps aux | awk '$8 ~ /Z/ {print $2}')
  if [ -n "$zombie_pids" ]; then
    log_warning "Found zombie processes, attempting cleanup..."
    for zombie_pid in $zombie_pids; do
      # xargs trims the padding ps puts around the number.
      parent_pid=$(ps -o ppid= -p "$zombie_pid" 2>/dev/null | xargs)
      if [ -n "$parent_pid" ] && [ "$parent_pid" != "1" ]; then
        debug_log "Signalling parent process $parent_pid to reap zombie $zombie_pid"
        sudo kill -s CHLD "$parent_pid" 2>/dev/null || true
      fi
    done
  fi

  # etimes is the elapsed time since process start, in seconds. The TIME
  # column of `ps aux` is accumulated CPU time and says nothing about age.
  # The age filter also keeps this short-lived awk (whose own command line
  # contains the search string) out of the result.
  stale_pids=$(ps -eo pid=,etimes=,args= \
    | awk -v max_age="$max_age_seconds" \
        '($2 + 0) > (max_age + 0) && index($0, "docker compose logs") {print $1}')
  if [ -n "$stale_pids" ]; then
    log_warning "Found long-running docker compose logs processes, cleaning up..."
    for pid in $stale_pids; do
      debug_log "Killing docker compose logs process $pid"
      sudo kill -TERM "$pid" 2>/dev/null || true
    done

    # Short grace period, then force only what ignored SIGTERM.
    sleep 1
    for pid in $stale_pids; do
      if sudo kill -0 "$pid" 2>/dev/null; then
        debug_log "Process $pid ignored SIGTERM, sending SIGKILL"
        sudo kill -KILL "$pid" 2>/dev/null || true
      fi
    done
  fi

  log_success "Process cleanup complete"
}
# Print part*100/total as an integer percentage on stdout.
# Returns 1 (printing nothing) unless both arguments are non-negative
# integers and total is greater than zero.
_resource_percent_free() {
  local part=$1 total=$2
  case "$part" in '' | *[!0-9]*) return 1 ;; esac
  case "$total" in '' | *[!0-9]*) return 1 ;; esac
  [ "$total" -gt 0 ] || return 1
  # 10# forces base 10 so a leading zero is never read as octal.
  printf '%s\n' "$((10#$part * 100 / 10#$total))"
}

# Check system resources (important for resource-constrained servers)
#
# Reports available memory (from `free -m`) and free space on / (from
# `df -Pm /`) as a percentage of the total.
#   - memory below 20%: warning only
#   - disk below 20%:   warning only
#   - disk below 10%:   error, returns 1
# If either tool's output cannot be parsed, that check is skipped with a
# warning instead of failing on an arithmetic error.
# Arguments: none
# Outputs:   results via log_success / log_warning / log_error / log_info
# Returns:   1 when free disk space is below 10%, otherwise 0
check_system_resources() {
  debug_log "Checking system resources"

  local mem_stats mem_total mem_available mem_percent
  local disk_stats disk_total disk_available disk_percent

  # LC_ALL=C: free translates the "Mem:" row label in other locales.
  # Column 2 is the total, column 7 the "available" estimate.
  mem_stats=$(LC_ALL=C free -m 2>/dev/null | awk '/^Mem:/ {print $2, $7}')
  mem_total=${mem_stats%% *}
  mem_available=${mem_stats##* }

  if mem_percent=$(_resource_percent_free "$mem_available" "$mem_total"); then
    if [ "$mem_percent" -lt 20 ]; then
      log_warning "Low memory available: ${mem_available}MB of ${mem_total}MB (${mem_percent}%)"
      log_info "Consider closing other applications before deployment"
    else
      log_success "Memory check passed: ${mem_available}MB available (${mem_percent}%)"
    fi
  else
    log_warning "Could not determine available memory, skipping memory check"
  fi

  # -P keeps each filesystem on one line, so row 2 is always the data row
  # even when the device name is long. Column 2 is total, column 4 available.
  disk_stats=$(LC_ALL=C df -Pm / 2>/dev/null | awk 'NR==2 {print $2, $4}')
  disk_total=${disk_stats%% *}
  disk_available=${disk_stats##* }

  if ! disk_percent=$(_resource_percent_free "$disk_available" "$disk_total"); then
    log_warning "Could not determine available disk space, skipping disk check"
    return 0
  fi

  if [ "$disk_percent" -lt 10 ]; then
    log_error "Critical: Low disk space available: ${disk_available}MB of ${disk_total}MB (${disk_percent}%)"
    log_error "Deployment may fail. Please free up disk space."
    return 1
  elif [ "$disk_percent" -lt 20 ]; then
    log_warning "Low disk space: ${disk_available}MB of ${disk_total}MB (${disk_percent}%)"
  else
    log_success "Disk space check passed: ${disk_available}MB available (${disk_percent}%)"
  fi

  return 0
}
# Check if Docker is installed and accessible # Check if Docker is installed and accessible
check_docker_installed() { check_docker_installed() {
debug_log "Checking if Docker is installed" debug_log "Checking if Docker is installed"
@@ -1529,6 +1647,12 @@ deploy_remote_server() {
debug_log "Set ACTUAL_USER=$ACTUAL_USER" debug_log "Set ACTUAL_USER=$ACTUAL_USER"
fi fi
# Clean up any orphaned processes before starting (important for resource-constrained servers)
cleanup_orphaned_processes
# Check system resources
check_system_resources
# Check Docker is installed # Check Docker is installed
if ! check_docker_installed; then if ! check_docker_installed; then
log_error "Docker must be installed before deploying remote server" log_error "Docker must be installed before deploying remote server"
@@ -1636,26 +1760,82 @@ register_remote_server_with_core() {
return 1 return 1
fi fi
# Verify SSH key exists
if [ ! -f "$key_path" ]; then
log_error "SSH key not found: $key_path"
log_error "Please ensure setup_ssh_key_to_core() completed successfully"
return 1
fi
log_info "Connecting to core server to register this remote server..." log_info "Connecting to core server to register this remote server..."
log_info "Using key: $key_path"
# Test SSH connection first
if ! LC_ALL=C ssh -i "$key_path" -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o BatchMode=yes -o LogLevel=ERROR \
"${CORE_SERVER_USER}@${CORE_SERVER_IP}" "echo 'test'" 2>&1 | grep -q "test"; then
log_error "Cannot establish SSH connection to core server"
log_error "Please verify:"
echo " 1. SSH key is installed: ssh -i $key_path ${CORE_SERVER_USER}@${CORE_SERVER_IP}"
echo " 2. Core server is reachable: ping ${CORE_SERVER_IP}"
echo " 3. SSH service is running on core server"
return 1
fi
log_success "SSH connection verified"
# SSH to core server and run registration function # SSH to core server and run registration function
LC_ALL=C ssh -i "$key_path" -o ConnectTimeout=10 -o StrictHostKeyChecking=no -o LogLevel=ERROR \ log_info "Running registration commands on core server..."
"${CORE_SERVER_USER}@${CORE_SERVER_IP}" bash <<EOF 2>&1 | grep -v "locale\|LC_ALL\|setlocale" local ssh_output=$(LC_ALL=C ssh -i "$key_path" -o ConnectTimeout=10 -o StrictHostKeyChecking=no -o LogLevel=ERROR \
"${CORE_SERVER_USER}@${CORE_SERVER_IP}" bash <<EOF 2>&1
# Source common.sh to get registration function # Source common.sh to get registration function
source ~/EZ-Homelab/scripts/common.sh if [ -f ~/EZ-Homelab/scripts/common.sh ]; then
source ~/EZ-Homelab/scripts/common.sh
else
echo "ERROR: common.sh not found"
exit 1
fi
# Register this remote server # Register this remote server
add_remote_server_to_traefik "${SERVER_IP}" "${SERVER_HOSTNAME}" add_remote_server_to_traefik "${SERVER_IP}" "${SERVER_HOSTNAME}"
# Verify files were created
if [ -f "/opt/stacks/core/traefik/dynamic/docker-provider-${SERVER_HOSTNAME}.yml" ]; then
echo "SUCCESS: docker-provider file created"
else
echo "ERROR: docker-provider file not created"
exit 1
fi
if [ -f "/opt/stacks/core/traefik/dynamic/sablier-middleware-${SERVER_HOSTNAME}.yml" ]; then
echo "SUCCESS: sablier-middleware file created"
else
echo "ERROR: sablier-middleware file not created"
exit 1
fi
# Restart Traefik to reload configs # Restart Traefik to reload configs
cd /opt/stacks/core cd /opt/stacks/core
docker compose restart traefik docker compose restart traefik
echo "SUCCESS: Registration complete"
EOF EOF
)
if [ $? -eq 0 ]; then local ssh_exit_code=$?
# Show output for debugging
echo "$ssh_output" | grep -v "locale\|LC_ALL\|setlocale"
if [ $ssh_exit_code -eq 0 ] && echo "$ssh_output" | grep -q "SUCCESS: Registration complete"; then
log_success "Successfully registered with core server" log_success "Successfully registered with core server"
log_info "Files created on core server:"
echo " - /opt/stacks/core/traefik/dynamic/docker-provider-${SERVER_HOSTNAME}.yml"
echo " - /opt/stacks/core/traefik/dynamic/sablier-middleware-${SERVER_HOSTNAME}.yml"
return 0
else else
log_error "Failed to register with core server via SSH" log_error "Failed to register with core server via SSH"
log_error "SSH output:"
echo "$ssh_output"
return 1 return 1
fi fi
} }
@@ -1786,15 +1966,46 @@ deploy_traefik_stack() {
mkdir -p "$traefik_dir/config" mkdir -p "$traefik_dir/config"
mkdir -p "$traefik_dir/dynamic" mkdir -p "$traefik_dir/dynamic"
# Create placeholder routes.yml file in dynamic directory
if [ ! -f "$traefik_dir/dynamic/routes.yml" ]; then
log_info "Creating placeholder routes.yml..."
cat > "$traefik_dir/dynamic/routes.yml" <<'ROUTESYML'
# Traefik Dynamic Routes for Remote Server
# Auto-generated by EZ-Homelab
#
# This file is watched by Traefik and reloaded automatically
# Add custom routes here if needed
http:
routers: {}
services: {}
middlewares: {}
ROUTESYML
log_success "Created routes.yml"
fi
# Verify docker-compose.yml exists
if [ ! -f "$traefik_dir/docker-compose.yml" ]; then
log_error "Traefik docker-compose.yml not found at $traefik_dir"
log_error "This should have been copied by copy_all_stacks_for_remote()"
return 1
fi
# Deploy # Deploy
log_info "Starting Traefik container..." log_info "Starting Traefik container..."
cd "$traefik_dir" cd "$traefik_dir"
if ! docker compose up -d; then if ! docker compose up -d; then
log_error "Failed to start Traefik stack" log_error "Failed to start Traefik stack"
log_error "Check logs: docker compose -f $traefik_dir/docker-compose.yml logs"
return 1 return 1
fi fi
log_success "Traefik stack deployed at $traefik_dir" # Verify container started
if docker ps | grep -q "traefik"; then
log_success "Traefik stack deployed and running at $traefik_dir"
else
log_warning "Traefik container may not be running, check: docker ps -a | grep traefik"
fi
} }
# Show help function # Show help function