Fix multiserver deployment: Add SSH config automation, enhance registration verification, improve Traefik deployment
- Auto-create SSH config entry for passwordless core server access
- Add pre-flight SSH connectivity check before registration
- Verify docker-provider and sablier-middleware files are created on core
- Display explicit success/failure messages with troubleshooting steps
- Create placeholder routes.yml for Traefik dynamic config
- Verify Traefik container starts successfully after deployment
- Add container status check after Traefik deployment

Fixes issues where remote server registration failed silently and Traefik deployment did not verify that its configuration files existed.
This commit is contained in:
@@ -352,6 +352,32 @@ setup_ssh_key_to_core() {
|
|||||||
"${CORE_SERVER_USER}@${CORE_SERVER_IP}" "echo 'SSH key authentication successful'" 2>&1 | grep -v "locale\|LC_ALL\|setlocale" | grep -q "successful"; then
|
"${CORE_SERVER_USER}@${CORE_SERVER_IP}" "echo 'SSH key authentication successful'" 2>&1 | grep -v "locale\|LC_ALL\|setlocale" | grep -q "successful"; then
|
||||||
log_success "SSH key authentication verified"
|
log_success "SSH key authentication verified"
|
||||||
|
|
||||||
|
# Add SSH config entry for automatic key usage
|
||||||
|
log_info "Adding SSH config entry for core server..."
|
||||||
|
local ssh_config="/home/$ACTUAL_USER/.ssh/config"
|
||||||
|
|
||||||
|
# Create config file if it doesn't exist
|
||||||
|
touch "$ssh_config"
|
||||||
|
chmod 600 "$ssh_config"
|
||||||
|
|
||||||
|
# Check if entry already exists
|
||||||
|
if ! grep -q "Host ${CORE_SERVER_HOSTNAME}" "$ssh_config" 2>/dev/null; then
|
||||||
|
cat >> "$ssh_config" <<SSHCONFIG
|
||||||
|
|
||||||
|
# Auto-generated by EZ-Homelab for remote server ${SERVER_HOSTNAME}
|
||||||
|
Host ${CORE_SERVER_HOSTNAME}
|
||||||
|
HostName ${CORE_SERVER_IP}
|
||||||
|
User ${CORE_SERVER_USER}
|
||||||
|
IdentityFile ${key_path}
|
||||||
|
StrictHostKeyChecking no
|
||||||
|
UserKnownHostsFile /dev/null
|
||||||
|
LogLevel ERROR
|
||||||
|
SSHCONFIG
|
||||||
|
log_success "SSH config entry added for ${CORE_SERVER_HOSTNAME}"
|
||||||
|
else
|
||||||
|
log_info "SSH config entry already exists"
|
||||||
|
fi
|
||||||
|
|
||||||
# Export key path for use by other functions
|
# Export key path for use by other functions
|
||||||
export SSH_KEY_PATH="$key_path"
|
export SSH_KEY_PATH="$key_path"
|
||||||
return 0
|
return 0
|
||||||
@@ -1330,6 +1356,14 @@ perform_deployment() {
|
|||||||
setup_docker_tls() {
|
setup_docker_tls() {
|
||||||
local TLS_DIR="/home/$ACTUAL_USER/EZ-Homelab/docker-tls"
|
local TLS_DIR="/home/$ACTUAL_USER/EZ-Homelab/docker-tls"
|
||||||
|
|
||||||
|
# Check if TLS is already configured
|
||||||
|
if [ -f "/etc/docker/daemon.json" ] && grep -q '"tls": true' /etc/docker/daemon.json 2>/dev/null; then
|
||||||
|
if systemctl cat docker.service | grep -q 'tcp://0.0.0.0:2376'; then
|
||||||
|
log_info "Docker TLS already configured, skipping..."
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
# Create TLS directory
|
# Create TLS directory
|
||||||
sudo mkdir -p "$TLS_DIR"
|
sudo mkdir -p "$TLS_DIR"
|
||||||
sudo chown "$ACTUAL_USER:$ACTUAL_USER" "$TLS_DIR"
|
sudo chown "$ACTUAL_USER:$ACTUAL_USER" "$TLS_DIR"
|
||||||
@@ -1368,13 +1402,33 @@ setup_docker_tls() {
|
|||||||
}
|
}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
# Update systemd service
|
# Update systemd service only if not already configured (idempotent)
|
||||||
sudo sed -i 's|-H fd://|-H fd:// -H tcp://0.0.0.0:2376|' /lib/systemd/system/docker.service
|
if ! systemctl cat docker.service | grep -q 'tcp://0.0.0.0:2376'; then
|
||||||
|
log_info "Adding TCP socket to Docker service..."
|
||||||
|
sudo sed -i 's|^ExecStart=/usr/bin/dockerd -H fd://|ExecStart=/usr/bin/dockerd -H fd:// -H tcp://0.0.0.0:2376|' /lib/systemd/system/docker.service
|
||||||
|
else
|
||||||
|
# Clean up any duplicate TCP socket entries
|
||||||
|
if systemctl cat docker.service | grep -c 'tcp://0.0.0.0:2376' | grep -q '^[2-9]'; then
|
||||||
|
log_warning "Found duplicate TCP socket entries, cleaning up..."
|
||||||
|
# Extract the current ExecStart line and remove duplicates
|
||||||
|
local exec_start=$(systemctl cat docker.service | grep '^ExecStart=' | head -1)
|
||||||
|
local cleaned_exec=$(echo "$exec_start" | sed 's|-H tcp://0.0.0.0:2376||g')
|
||||||
|
cleaned_exec="${cleaned_exec} -H tcp://0.0.0.0:2376"
|
||||||
|
sudo sed -i "s|^ExecStart=.*|${cleaned_exec}|" /lib/systemd/system/docker.service
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
# Reload and restart Docker
|
# Reload and restart Docker
|
||||||
sudo systemctl daemon-reload
|
sudo systemctl daemon-reload
|
||||||
sudo systemctl restart docker
|
sudo systemctl restart docker
|
||||||
|
|
||||||
|
# Wait for Docker to be ready
|
||||||
|
sleep 3
|
||||||
|
if ! docker ps &>/dev/null; then
|
||||||
|
log_error "Docker failed to start after TLS configuration"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
log_success "Docker TLS configured on port 2376"
|
log_success "Docker TLS configured on port 2376"
|
||||||
}
|
}
|
||||||
setup_stacks_for_dockge() {
|
setup_stacks_for_dockge() {
|
||||||
@@ -1469,6 +1523,70 @@ show_main_menu() {
|
|||||||
# MULTI-SERVER DEPLOYMENT FUNCTIONS
|
# MULTI-SERVER DEPLOYMENT FUNCTIONS
|
||||||
# =============================================
|
# =============================================
|
||||||
|
|
||||||
|
# Clean up orphaned processes (important for resource-constrained servers)
|
||||||
|
cleanup_orphaned_processes() {
    # Best-effort process housekeeping run before a deployment.
    #
    # 1. For every zombie process whose parent is not PID 1, send SIGCHLD to
    #    the parent so that it gets a chance to reap the zombie. The parent
    #    itself is NOT terminated.
    # 2. Terminate 'docker compose logs' processes that have been running
    #    for more than one hour (SIGTERM first, SIGKILL for survivors).
    #
    # Globals:   none
    # Arguments: none
    # Outputs:   progress messages via debug_log / log_warning / log_success
    # Returns:   status of the final log_success call; signalling failures
    #            are deliberately ignored (best effort).
    debug_log "Cleaning up orphaned processes"

    # One ps call yields "<pid> <ppid>" for every zombie, which avoids a
    # second per-zombie ps lookup (and the race that comes with it).
    local zombie_pairs zombie_pid parent_pid
    zombie_pairs=$(ps -eo pid=,ppid=,stat= 2>/dev/null | awk '$3 ~ /Z/ {print $1, $2}')
    if [ -n "$zombie_pairs" ]; then
        log_warning "Found zombie processes, attempting cleanup..."
        while read -r zombie_pid parent_pid; do
            if [ -n "$parent_pid" ] && [ "$parent_pid" != "1" ]; then
                debug_log "Sending SIGCHLD to parent process $parent_pid to reap zombie $zombie_pid"
                # </dev/null keeps sudo from consuming the loop's input.
                sudo kill -SIGCHLD "$parent_pid" </dev/null 2>/dev/null || true
            fi
        done <<< "$zombie_pairs"
    fi

    # 'etimes' is the elapsed wall-clock time in seconds since the process
    # started. The TIME column of 'ps aux' is cumulative CPU time and must
    # not be used to judge process age.
    local max_age_seconds=3600
    local stale_pids pid
    stale_pids=$(ps -eo pid=,etimes=,args= 2>/dev/null \
        | awk -v max_age="$max_age_seconds" '$2 > max_age && /docker compose logs/ {print $1}')
    if [ -n "$stale_pids" ]; then
        log_warning "Found long-running docker compose logs processes, cleaning up..."
        # Word splitting is intentional: stale_pids is a whitespace-separated
        # list of numeric PIDs.
        for pid in $stale_pids; do
            debug_log "Killing docker compose logs process $pid"
            sudo kill -TERM "$pid" 2>/dev/null || true
        done

        # Give the processes a moment to exit, then force-kill survivors.
        sleep 1
        for pid in $stale_pids; do
            if sudo kill -0 "$pid" 2>/dev/null; then
                sudo kill -KILL "$pid" 2>/dev/null || true
            fi
        done
    fi

    log_success "Process cleanup complete"
}
|
||||||
|
|
||||||
|
# Check system resources (important for resource-constrained servers)
|
||||||
|
check_system_resources() {
    # Report available memory and free disk space on / before a deployment.
    #
    # Memory: warns when less than 20% of total memory is available.
    # Disk:   fails when less than 10% of / is free, warns below 20%.
    # If either figure cannot be determined (tool missing, unexpected output
    # format, zero total) the corresponding check is skipped with a warning
    # instead of aborting on an arithmetic error.
    #
    # Globals:   none
    # Arguments: none
    # Outputs:   messages via debug_log / log_info / log_warning /
    #            log_error / log_success
    # Returns:   1 if free disk space on / is below 10%, 0 otherwise
    debug_log "Checking system resources"

    # LC_ALL=C pins the "Mem:" row label, which is translated in some locales.
    local mem_stats mem_total mem_available mem_percent
    mem_stats=$(LC_ALL=C free -m 2>/dev/null | awk '/^Mem:/ {print $2, $7}')
    read -r mem_total mem_available <<< "$mem_stats"

    if [[ "$mem_total" =~ ^[0-9]+$ && "$mem_available" =~ ^[0-9]+$ ]] && (( mem_total > 0 )); then
        mem_percent=$(( mem_available * 100 / mem_total ))
        if (( mem_percent < 20 )); then
            log_warning "Low memory available: ${mem_available}MB of ${mem_total}MB (${mem_percent}%)"
            log_info "Consider closing other applications before deployment"
        else
            log_success "Memory check passed: ${mem_available}MB available (${mem_percent}%)"
        fi
    else
        log_warning "Unable to determine memory usage; skipping memory check"
    fi

    # -P forces the one-line-per-filesystem POSIX layout so that a long
    # device name cannot push the numbers onto a third line.
    local disk_stats disk_total disk_available disk_percent
    disk_stats=$(LC_ALL=C df -Pm / 2>/dev/null | awk 'NR==2 {print $2, $4}')
    read -r disk_total disk_available <<< "$disk_stats"

    if ! [[ "$disk_total" =~ ^[0-9]+$ && "$disk_available" =~ ^[0-9]+$ ]] || (( disk_total == 0 )); then
        log_warning "Unable to determine disk usage for /; skipping disk space check"
        return 0
    fi

    disk_percent=$(( disk_available * 100 / disk_total ))
    if (( disk_percent < 10 )); then
        log_error "Critical: Low disk space available: ${disk_available}MB of ${disk_total}MB (${disk_percent}%)"
        log_error "Deployment may fail. Please free up disk space."
        return 1
    elif (( disk_percent < 20 )); then
        log_warning "Low disk space: ${disk_available}MB of ${disk_total}MB (${disk_percent}%)"
    else
        log_success "Disk space check passed: ${disk_available}MB available (${disk_percent}%)"
    fi

    return 0
}
|
||||||
|
|
||||||
# Check if Docker is installed and accessible
|
# Check if Docker is installed and accessible
|
||||||
check_docker_installed() {
|
check_docker_installed() {
|
||||||
debug_log "Checking if Docker is installed"
|
debug_log "Checking if Docker is installed"
|
||||||
@@ -1529,6 +1647,12 @@ deploy_remote_server() {
|
|||||||
debug_log "Set ACTUAL_USER=$ACTUAL_USER"
|
debug_log "Set ACTUAL_USER=$ACTUAL_USER"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Clean up any orphaned processes before starting (important for resource-constrained servers)
|
||||||
|
cleanup_orphaned_processes
|
||||||
|
|
||||||
|
# Check system resources
|
||||||
|
check_system_resources
|
||||||
|
|
||||||
# Check Docker is installed
|
# Check Docker is installed
|
||||||
if ! check_docker_installed; then
|
if ! check_docker_installed; then
|
||||||
log_error "Docker must be installed before deploying remote server"
|
log_error "Docker must be installed before deploying remote server"
|
||||||
@@ -1636,26 +1760,82 @@ register_remote_server_with_core() {
|
|||||||
return 1
|
return 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Verify SSH key exists
|
||||||
|
if [ ! -f "$key_path" ]; then
|
||||||
|
log_error "SSH key not found: $key_path"
|
||||||
|
log_error "Please ensure setup_ssh_key_to_core() completed successfully"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
log_info "Connecting to core server to register this remote server..."
|
log_info "Connecting to core server to register this remote server..."
|
||||||
|
log_info "Using key: $key_path"
|
||||||
|
|
||||||
|
# Test SSH connection first
|
||||||
|
if ! LC_ALL=C ssh -i "$key_path" -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o BatchMode=yes -o LogLevel=ERROR \
|
||||||
|
"${CORE_SERVER_USER}@${CORE_SERVER_IP}" "echo 'test'" 2>&1 | grep -q "test"; then
|
||||||
|
log_error "Cannot establish SSH connection to core server"
|
||||||
|
log_error "Please verify:"
|
||||||
|
echo " 1. SSH key is installed: ssh -i $key_path ${CORE_SERVER_USER}@${CORE_SERVER_IP}"
|
||||||
|
echo " 2. Core server is reachable: ping ${CORE_SERVER_IP}"
|
||||||
|
echo " 3. SSH service is running on core server"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
log_success "SSH connection verified"
|
||||||
|
|
||||||
# SSH to core server and run registration function
|
# SSH to core server and run registration function
|
||||||
LC_ALL=C ssh -i "$key_path" -o ConnectTimeout=10 -o StrictHostKeyChecking=no -o LogLevel=ERROR \
|
log_info "Running registration commands on core server..."
|
||||||
"${CORE_SERVER_USER}@${CORE_SERVER_IP}" bash <<EOF 2>&1 | grep -v "locale\|LC_ALL\|setlocale"
|
local ssh_output=$(LC_ALL=C ssh -i "$key_path" -o ConnectTimeout=10 -o StrictHostKeyChecking=no -o LogLevel=ERROR \
|
||||||
|
"${CORE_SERVER_USER}@${CORE_SERVER_IP}" bash <<EOF 2>&1
|
||||||
# Source common.sh to get registration function
|
# Source common.sh to get registration function
|
||||||
|
if [ -f ~/EZ-Homelab/scripts/common.sh ]; then
|
||||||
source ~/EZ-Homelab/scripts/common.sh
|
source ~/EZ-Homelab/scripts/common.sh
|
||||||
|
else
|
||||||
|
echo "ERROR: common.sh not found"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
# Register this remote server
|
# Register this remote server
|
||||||
add_remote_server_to_traefik "${SERVER_IP}" "${SERVER_HOSTNAME}"
|
add_remote_server_to_traefik "${SERVER_IP}" "${SERVER_HOSTNAME}"
|
||||||
|
|
||||||
|
# Verify files were created
|
||||||
|
if [ -f "/opt/stacks/core/traefik/dynamic/docker-provider-${SERVER_HOSTNAME}.yml" ]; then
|
||||||
|
echo "SUCCESS: docker-provider file created"
|
||||||
|
else
|
||||||
|
echo "ERROR: docker-provider file not created"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -f "/opt/stacks/core/traefik/dynamic/sablier-middleware-${SERVER_HOSTNAME}.yml" ]; then
|
||||||
|
echo "SUCCESS: sablier-middleware file created"
|
||||||
|
else
|
||||||
|
echo "ERROR: sablier-middleware file not created"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
# Restart Traefik to reload configs
|
# Restart Traefik to reload configs
|
||||||
cd /opt/stacks/core
|
cd /opt/stacks/core
|
||||||
docker compose restart traefik
|
docker compose restart traefik
|
||||||
EOF
|
|
||||||
|
|
||||||
if [ $? -eq 0 ]; then
|
echo "SUCCESS: Registration complete"
|
||||||
|
EOF
|
||||||
|
)
|
||||||
|
|
||||||
|
local ssh_exit_code=$?
|
||||||
|
|
||||||
|
# Show output for debugging
|
||||||
|
echo "$ssh_output" | grep -v "locale\|LC_ALL\|setlocale"
|
||||||
|
|
||||||
|
if [ $ssh_exit_code -eq 0 ] && echo "$ssh_output" | grep -q "SUCCESS: Registration complete"; then
|
||||||
log_success "Successfully registered with core server"
|
log_success "Successfully registered with core server"
|
||||||
|
log_info "Files created on core server:"
|
||||||
|
echo " - /opt/stacks/core/traefik/dynamic/docker-provider-${SERVER_HOSTNAME}.yml"
|
||||||
|
echo " - /opt/stacks/core/traefik/dynamic/sablier-middleware-${SERVER_HOSTNAME}.yml"
|
||||||
|
return 0
|
||||||
else
|
else
|
||||||
log_error "Failed to register with core server via SSH"
|
log_error "Failed to register with core server via SSH"
|
||||||
|
log_error "SSH output:"
|
||||||
|
echo "$ssh_output"
|
||||||
return 1
|
return 1
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
@@ -1786,15 +1966,46 @@ deploy_traefik_stack() {
|
|||||||
mkdir -p "$traefik_dir/config"
|
mkdir -p "$traefik_dir/config"
|
||||||
mkdir -p "$traefik_dir/dynamic"
|
mkdir -p "$traefik_dir/dynamic"
|
||||||
|
|
||||||
|
# Create placeholder routes.yml file in dynamic directory
|
||||||
|
if [ ! -f "$traefik_dir/dynamic/routes.yml" ]; then
|
||||||
|
log_info "Creating placeholder routes.yml..."
|
||||||
|
cat > "$traefik_dir/dynamic/routes.yml" <<'ROUTESYML'
|
||||||
|
# Traefik Dynamic Routes for Remote Server
|
||||||
|
# Auto-generated by EZ-Homelab
|
||||||
|
#
|
||||||
|
# This file is watched by Traefik and reloaded automatically
|
||||||
|
# Add custom routes here if needed
|
||||||
|
|
||||||
|
http:
|
||||||
|
routers: {}
|
||||||
|
services: {}
|
||||||
|
middlewares: {}
|
||||||
|
ROUTESYML
|
||||||
|
log_success "Created routes.yml"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Verify docker-compose.yml exists
|
||||||
|
if [ ! -f "$traefik_dir/docker-compose.yml" ]; then
|
||||||
|
log_error "Traefik docker-compose.yml not found at $traefik_dir"
|
||||||
|
log_error "This should have been copied by copy_all_stacks_for_remote()"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
# Deploy
|
# Deploy
|
||||||
log_info "Starting Traefik container..."
|
log_info "Starting Traefik container..."
|
||||||
cd "$traefik_dir"
|
cd "$traefik_dir"
|
||||||
if ! docker compose up -d; then
|
if ! docker compose up -d; then
|
||||||
log_error "Failed to start Traefik stack"
|
log_error "Failed to start Traefik stack"
|
||||||
|
log_error "Check logs: docker compose -f $traefik_dir/docker-compose.yml logs"
|
||||||
return 1
|
return 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
log_success "Traefik stack deployed at $traefik_dir"
|
# Verify container started
|
||||||
|
if docker ps | grep -q "traefik"; then
|
||||||
|
log_success "Traefik stack deployed and running at $traefik_dir"
|
||||||
|
else
|
||||||
|
log_warning "Traefik container may not be running, check: docker ps -a | grep traefik"
|
||||||
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
# Show help function
|
# Show help function
|
||||||
|
|||||||
Reference in New Issue
Block a user