- Complete modular bash-based setup system replacing Python TUI - Phase 1-4 implementation: Core Infrastructure, Configuration Management, Deployment Engine, Service Orchestration & Management - 11 production-ready scripts: preflight.sh, setup.sh, pre-deployment-wizard.sh, localize.sh, generalize.sh, validate.sh, deploy.sh, service.sh, monitor.sh, backup.sh, update.sh - Shared libraries: common.sh (utilities), ui.sh (text interface) - Template-based configuration system with environment variable substitution - Comprehensive documentation: PRD, standards, and quick reference guides - Automated backup, monitoring, and update management capabilities - Cross-platform compatibility with robust error handling and logging
577 lines
18 KiB
Bash
Executable File
577 lines
18 KiB
Bash
Executable File
#!/bin/bash
#
# EZ-Homelab Enhanced Setup Scripts - Service Monitoring
#
# Real-time service monitoring and alerting.

SCRIPT_VERSION='1.0.0'
SCRIPT_NAME='monitor'

# Pull in the shared libraries from the lib/ directory that sits beside this
# script: common.sh first, then ui.sh.
. "$(dirname "${BASH_SOURCE[0]}")/lib/common.sh"
. "$(dirname "${BASH_SOURCE[0]}")/lib/ui.sh"
|
|
|
|
# =============================================================================
# MONITORING CONFIGURATION
# =============================================================================

# Percentages above which an alert is raised.
DISK_THRESHOLD=90
MEMORY_THRESHOLD=80
CPU_THRESHOLD=80

# Check intervals, in seconds.
# NOTE(review): only HEALTH_CHECK_INTERVAL is referenced by the code visible
# in this script; the other two may be consumed elsewhere - confirm.
LOG_CHECK_INTERVAL=300
RESOURCE_CHECK_INTERVAL=60
HEALTH_CHECK_INTERVAL=30

# Minimum number of seconds between two alerts that share the same key, so
# a persistent condition does not produce alert spam.
ALERT_COOLDOWN=300

# JSON file holding the monitoring state (LOG_DIR is not set in this file).
MONITOR_STATE_FILE="${LOG_DIR}/monitor_state.json"
|
|
|
|
# =============================================================================
|
|
# MONITORING STATE MANAGEMENT
|
|
# =============================================================================
|
|
|
|
# Initialize monitoring state
#
# Writes a fresh JSON skeleton to $MONITOR_STATE_FILE when that file is
# missing or empty. An existing non-empty file is left untouched.
#
# Globals:   MONITOR_STATE_FILE (read)
# Returns:   0 when the file already has content or was written;
#            non-zero when the directory or the file cannot be created.
init_monitor_state() {
    # -s (exists and non-empty) instead of -f: a zero-byte file is not valid
    # JSON, so every later jq read of it would come back empty.
    if [[ -s "$MONITOR_STATE_FILE" ]]; then
        return 0
    fi

    # The redirection below fails if the parent directory does not exist yet.
    mkdir -p -- "$(dirname -- "$MONITOR_STATE_FILE")" || return 1

    cat > "$MONITOR_STATE_FILE" << EOF
{
    "services": {},
    "alerts": {},
    "last_check": $(date +%s),
    "system_stats": {}
}
EOF
}
|
|
|
|
# Update service state
#
# Stores {"status": <status>, "last_update": <epoch seconds>} under
# .services[<service>] in $MONITOR_STATE_FILE. The new document is written to
# "${MONITOR_STATE_FILE}.tmp" and moved into place only if jq succeeded.
# When jq is not available nothing is stored and a warning is logged.
#
# Arguments: $1 - service name
#            $2 - status string to record
# Globals:   MONITOR_STATE_FILE (read, file rewritten)
update_service_state() {
    local service="$1"
    local status="$2"
    local now
    now=$(date +%s)

    # No jq: nothing can be persisted, only warn.
    if ! command_exists "jq"; then
        log_warn "jq not available, using basic state tracking"
        return
    fi

    local scratch="${MONITOR_STATE_FILE}.tmp"
    jq --arg service "$service" --arg status "$status" --argjson timestamp "$now" \
        '.services[$service] = {"status": $status, "last_update": $timestamp}' \
        "$MONITOR_STATE_FILE" > "$scratch" \
        && mv "$scratch" "$MONITOR_STATE_FILE"
}
|
|
|
|
# Check if alert should be sent (cooldown check)
#
# Looks up the time of the last alert for the given key in
# $MONITOR_STATE_FILE. If at least ALERT_COOLDOWN seconds have passed (or no
# usable timestamp is stored), the stored time is set to "now" and the
# function reports that the alert may be sent.
#
# Arguments: $1 - alert key
# Globals:   MONITOR_STATE_FILE (read, file rewritten), ALERT_COOLDOWN (read)
# Returns:   0 if the caller should alert, 1 if the key is still cooling down.
should_alert() {
    local alert_key="$1"
    local current_time
    current_time=$(date +%s)

    # Without jq, always alert (no cooldown)
    if ! command_exists "jq"; then
        return 0
    fi

    # The key is passed with --arg instead of being pasted into the jq
    # program, so keys containing quotes or backslashes cannot break the filter.
    local last_alert
    last_alert=$(jq -r --arg alert_key "$alert_key" \
        '.alerts[$alert_key] // 0' "$MONITOR_STATE_FILE")

    # A missing or corrupt state file yields an empty or non-numeric value;
    # treat that as "never alerted" rather than feeding it to arithmetic.
    if ! [[ "$last_alert" =~ ^[0-9]+$ ]]; then
        last_alert=0
    fi

    if (( current_time - last_alert < ALERT_COOLDOWN )); then
        return 1
    fi

    # Update last alert time
    jq --arg alert_key "$alert_key" --argjson timestamp "$current_time" \
        '.alerts[$alert_key] = $timestamp' \
        "$MONITOR_STATE_FILE" > "${MONITOR_STATE_FILE}.tmp" \
        && mv "${MONITOR_STATE_FILE}.tmp" "$MONITOR_STATE_FILE"
    return 0
}
|
|
|
|
# =============================================================================
|
|
# HEALTH MONITORING FUNCTIONS
|
|
# =============================================================================
|
|
|
|
# Check service health
#
# Records the current state of one service through update_service_state and
# raises an alert (subject to should_alert) when it is down or unhealthy.
#
# Arguments: $1 - service name (also handed to `docker inspect`)
# Returns:   1 when the service is not running or reports "unhealthy";
#            0 for "healthy", "starting" and anything else ("unknown").
check_service_health() {
    local service="$1"
    local reported
    local state
    local rc=0

    # A service that is not running is recorded as "down" and not inspected.
    if ! is_service_running "$service"; then
        if should_alert "service_down_$service"; then
            print_error "ALERT: Service '$service' is down"
            log_error "Service '$service' is down"
        fi
        update_service_state "$service" "down"
        return 1
    fi

    # If the inspect call fails, the status falls back to "unknown".
    reported=$(docker inspect "$service" --format '{{.State.Health.Status}}' 2>/dev/null || echo "unknown")

    case "$reported" in
        healthy | starting)
            state="$reported"
            ;;
        unhealthy)
            state="unhealthy"
            rc=1
            if should_alert "service_unhealthy_$service"; then
                print_warning "ALERT: Service '$service' is unhealthy"
                log_warn "Service '$service' is unhealthy"
            fi
            ;;
        *)
            state="unknown"
            ;;
    esac

    update_service_state "$service" "$state"
    return "$rc"
}
|
|
|
|
# Check all services health
#
# Runs check_service_health for every name printed (one per line) by
# find_all_services and prints a one-line summary.
#
# Outputs:   progress and summary through print_info / print_success /
#            print_warning
check_all_services_health() {
    print_info "Checking service health..."

    local services=()
    mapfile -t services < <(find_all_services)

    local service
    local unhealthy_count=0

    for service in "${services[@]}"; do
        if ! check_service_health "$service"; then
            # Not ((unhealthy_count++)): that form returns status 1 when the
            # old value is 0, which aborts the script under `set -e`.
            unhealthy_count=$((unhealthy_count + 1))
        fi
    done

    if (( unhealthy_count == 0 )); then
        print_success "All services are healthy"
    else
        print_warning "$unhealthy_count service(s) have issues"
    fi
}
|
|
|
|
# =============================================================================
|
|
# RESOURCE MONITORING FUNCTIONS
|
|
# =============================================================================
|
|
|
|
# Check system resources
#
# Samples CPU, memory and root-filesystem usage as whole percentages, raises
# an alert (subject to should_alert) for each value above its threshold, and
# prints a one-line summary. A value that cannot be parsed is reported as 0
# instead of breaking the arithmetic comparisons.
#
# Globals:   CPU_THRESHOLD, MEMORY_THRESHOLD, DISK_THRESHOLD (read)
# Outputs:   alerts via print_error / log_error, summary via print_info
check_system_resources() {
    print_info "Checking system resources..."

    local cpu_usage memory_usage disk_usage

    # CPU usage: 100 minus the idle percentage from top's "Cpu(s)" line.
    # LC_ALL=C keeps labels and decimal points in the form the sed/awk
    # expressions expect; `sed -n ... p` drops a line that does not match
    # rather than passing it through unparsed.
    cpu_usage=$(LC_ALL=C top -bn1 \
        | grep "Cpu(s)" \
        | sed -n "s/.*, *\([0-9.]*\)%* id.*/\1/p" \
        | LC_ALL=C awk 'NR == 1 { printf "%.0f", 100 - $1 }')
    if ! [[ "$cpu_usage" =~ ^[0-9]+$ ]]; then
        cpu_usage=0
    fi

    if (( cpu_usage > CPU_THRESHOLD )); then
        if should_alert "high_cpu"; then
            print_error "ALERT: High CPU usage: ${cpu_usage}% (threshold: ${CPU_THRESHOLD}%)"
            log_error "High CPU usage: ${cpu_usage}%"
        fi
    fi

    # Memory usage: used / total from the "Mem" line of free; the $2 > 0
    # guard avoids a division by zero.
    memory_usage=$(LC_ALL=C free \
        | LC_ALL=C awk '/Mem/ && $2 > 0 { printf "%.0f", $3 / $2 * 100.0; exit }')
    if ! [[ "$memory_usage" =~ ^[0-9]+$ ]]; then
        memory_usage=0
    fi

    if (( memory_usage > MEMORY_THRESHOLD )); then
        if should_alert "high_memory"; then
            print_error "ALERT: High memory usage: ${memory_usage}% (threshold: ${MEMORY_THRESHOLD}%)"
            log_error "High memory usage: ${memory_usage}%"
        fi
    fi

    # Disk usage of /: -P requests the POSIX output format, which keeps each
    # filesystem on a single line so column 5 is always the percentage.
    disk_usage=$(LC_ALL=C df -P / | tail -n 1 | awk '{print $5}' | sed 's/%//')
    if ! [[ "$disk_usage" =~ ^[0-9]+$ ]]; then
        disk_usage=0
    fi

    if (( disk_usage > DISK_THRESHOLD )); then
        if should_alert "high_disk"; then
            print_error "ALERT: High disk usage: ${disk_usage}% (threshold: ${DISK_THRESHOLD}%)"
            log_error "High disk usage: ${disk_usage}%"
        fi
    fi

    print_info "CPU: ${cpu_usage}%, Memory: ${memory_usage}%, Disk: ${disk_usage}%"
}
|
|
|
|
# Check Docker resource usage
#
# Compares the CPU and memory percentage of every container reported by
# `docker stats` against CPU_THRESHOLD / MEMORY_THRESHOLD and raises a warning
# (subject to should_alert) for each one that exceeds its threshold.
# Does nothing beyond the initial info line when docker is missing or
# docker_available fails.
#
# Globals:   CPU_THRESHOLD, MEMORY_THRESHOLD (read)
# Outputs:   warnings via print_warning / log_warn
check_docker_resources() {
    print_info "Checking Docker resources..."

    if ! command_exists "docker" || ! docker_available; then
        return 0
    fi

    local container cpu_perc mem_perc

    # One `docker stats --no-stream` call for all containers instead of one
    # blocking call per container. Fields are space-separated; none of the
    # three values contains whitespace.
    while read -r container cpu_perc mem_perc; do
        if [[ -z "$container" ]]; then
            continue
        fi

        # Convert to whole numbers for comparison: drop the trailing "%" and
        # the fractional part.
        cpu_perc=${cpu_perc%\%}
        cpu_perc=${cpu_perc%.*}
        mem_perc=${mem_perc%\%}
        mem_perc=${mem_perc%.*}

        # The regex guard skips non-numeric values instead of feeding them
        # to the arithmetic comparison.
        if [[ "$cpu_perc" =~ ^[0-9]+$ ]] && (( cpu_perc > CPU_THRESHOLD )); then
            if should_alert "container_high_cpu_$container"; then
                print_warning "ALERT: Container '$container' high CPU: ${cpu_perc}%"
                log_warn "Container '$container' high CPU: ${cpu_perc}%"
            fi
        fi

        if [[ "$mem_perc" =~ ^[0-9]+$ ]] && (( mem_perc > MEMORY_THRESHOLD )); then
            if should_alert "container_high_memory_$container"; then
                print_warning "ALERT: Container '$container' high memory: ${mem_perc}%"
                log_warn "Container '$container' high memory: ${mem_perc}%"
            fi
        fi
    done < <(docker stats --no-stream --format '{{.Name}} {{.CPUPerc}} {{.MemPerc}}' 2>/dev/null)

    return 0
}
|
|
|
|
# =============================================================================
|
|
# LOG MONITORING FUNCTIONS
|
|
# =============================================================================
|
|
|
|
# Check service logs for errors
#
# Fetches the recent `docker compose logs` output of one running service and
# counts the lines matching each of a fixed list of error keywords. If any
# keyword matches, a warning is raised (subject to should_alert) listing
# "<keyword>: <line count>" pairs.
#
# Arguments: $1 - service name
#            $2 - value for `docker compose logs --since` (default: 1m)
# Returns:   0 when the service is not running or the scan completed;
#            1 when no compose file (or its directory) is found.
check_service_logs() {
    local service="$1"
    local since="${2:-1m}" # Default to last minute

    if ! is_service_running "$service"; then
        return 0
    fi

    local compose_file
    compose_file=$(get_service_compose_file "$service")
    if [[ -z "$compose_file" ]]; then
        return 1
    fi

    local compose_dir
    compose_dir=$(dirname "$compose_file")
    if [[ ! -d "$compose_dir" ]]; then
        return 1
    fi

    # Fetch the logs once; they do not change between patterns, so there is
    # no reason to re-run docker compose for every keyword.
    local logs
    logs=$(cd "$compose_dir" && docker compose logs --since="$since" "$service" 2>&1)

    # Check for error patterns in recent logs
    local error_patterns=("ERROR" "error" "Exception" "failed" "Failed" "panic" "PANIC")
    local errors_found=()
    local pattern error_count

    for pattern in "${error_patterns[@]}"; do
        # grep -c prints 0 but exits 1 when nothing matches; `|| true` keeps
        # that from being treated as a failure. -F: keywords are literals.
        error_count=$(grep -c -F -- "$pattern" <<< "$logs" || true)

        if (( error_count > 0 )); then
            errors_found+=("$pattern: $error_count")
        fi
    done

    if [[ ${#errors_found[@]} -gt 0 ]]; then
        if should_alert "log_errors_$service"; then
            print_warning "ALERT: Service '$service' has errors in logs: ${errors_found[*]}"
            log_warn "Service '$service' log errors: ${errors_found[*]}"
        fi
    fi

    return 0
}
|
|
|
|
# Check all services logs
#
# Calls check_service_logs (with its default time window) for every service
# name printed, one per line, by find_all_services.
#
# Returns:   the status of the last check_service_logs call, or 0 when there
#            are no services.
check_all_logs() {
    print_info "Checking service logs for errors..."

    local services
    mapfile -t services < <(find_all_services)

    local idx
    local rc=0
    for (( idx = 0; idx < ${#services[@]}; idx++ )); do
        check_service_logs "${services[idx]}"
        rc=$?
    done

    return "$rc"
}
|
|
|
|
# =============================================================================
|
|
# MONITORING DISPLAY FUNCTIONS
|
|
# =============================================================================
|
|
|
|
# Display monitoring dashboard
#
# Prints three sections to stdout: system resource usage, a count of
# total / running / unhealthy services, and the alerts recorded in
# $MONITOR_STATE_FILE during the last hour (requires jq).
#
# Globals:   MONITOR_STATE_FILE (read)
show_monitoring_dashboard() {
    print_info "EZ-Homelab Monitoring Dashboard"
    echo

    # System resources
    echo "=== System Resources ==="
    local cpu_usage memory_usage disk_usage
    cpu_usage=$(top -bn1 | grep "Cpu(s)" | sed "s/.*, *\([0-9.]*\)%* id.*/\1/" | awk '{print 100 - $1}')
    memory_usage=$(free | grep Mem | awk '{printf "%.0f", $3/$2 * 100.0}')
    disk_usage=$(df / | tail -1 | awk '{print $5}' | sed 's/%//')

    # The 0 fallback is applied on the captured value: a `pipeline || echo 0`
    # only sees the exit status of the last stage, which succeeds even when
    # an earlier stage produced nothing.
    echo "CPU Usage: ${cpu_usage:-0}%"
    echo "Memory Usage: ${memory_usage:-0}%"
    echo "Disk Usage: ${disk_usage:-0}%"
    echo

    # Service status summary
    echo "=== Service Status ==="
    local services=()
    mapfile -t services < <(find_all_services)
    local total_services=${#services[@]}
    local running_services=0
    local unhealthy_services=0
    local service health_status

    for service in "${services[@]}"; do
        if is_service_running "$service"; then
            running_services=$((running_services + 1))

            health_status=$(docker inspect "$service" --format '{{.State.Health.Status}}' 2>/dev/null || echo "unknown")
            if [[ "$health_status" == "unhealthy" ]]; then
                unhealthy_services=$((unhealthy_services + 1))
            fi
        fi
    done

    echo "Total Services: $total_services"
    echo "Running: $running_services"
    echo "Unhealthy: $unhealthy_services"
    echo

    # Recent alerts: entries of .alerts whose timestamp is within the last
    # 3600 seconds, printed as "<key>: HH:MM:SS".
    echo "=== Recent Alerts ==="
    if command_exists "jq" && [[ -f "$MONITOR_STATE_FILE" ]]; then
        local recent_alerts
        recent_alerts=$(jq -r '.alerts | to_entries[] | select(.value > (now - 3600)) | "\(.key): \(.value | strftime("%H:%M:%S"))"' "$MONITOR_STATE_FILE" 2>/dev/null || echo "")

        if [[ -n "$recent_alerts" ]]; then
            echo "$recent_alerts"
        else
            echo "No recent alerts (last hour)"
        fi
    else
        echo "Alert history not available (jq not installed)"
    fi
}
|
|
|
|
# Display detailed service status
#
# Prints, for one service: running/stopped state, the `docker ps` row
# (image, status, ports), the container health status, one `docker stats`
# sample (CPU, memory, network I/O, block I/O) and the last 5 log lines from
# `docker compose logs`.
#
# Arguments: $1 - service name (required)
# Returns:   1 when no service name is given; 0 when the service is stopped.
show_detailed_status() {
    local service="$1"

    if [[ -z "$service" ]]; then
        print_error "Service name required"
        return 1
    fi

    print_info "Detailed Status for: $service"
    echo

    if ! is_service_running "$service"; then
        echo "Status: ❌ Stopped"
        return 0
    fi

    echo "Status: ✅ Running"

    # Container details (tail -n +2 drops the table header row)
    local container_info
    container_info=$(docker ps --filter "name=^${service}$" --format "table {{.Image}}\t{{.Status}}\t{{.Ports}}" | tail -n +2)
    if [[ -n "$container_info" ]]; then
        echo "Container: $container_info"
    fi

    # Health status ("N/A" when the inspect call fails)
    local health_status
    health_status=$(docker inspect "$service" --format '{{.State.Health.Status}}' 2>/dev/null || echo "N/A")
    echo "Health: $health_status"

    # Resource usage
    local stats
    stats=$(docker stats --no-stream --format "table {{.CPUPerc}}\t{{.MemPerc}}\t{{.NetIO}}\t{{.BlockIO}}" "$service" 2>/dev/null | tail -n +2)
    if [[ -n "$stats" ]]; then
        echo "Resources: $stats"
    fi

    # Recent logs
    echo
    echo "Recent Logs:"
    local compose_file compose_dir
    compose_file=$(get_service_compose_file "$service")
    if [[ -z "$compose_file" ]]; then
        # Without a compose file there is nothing to query; say so instead
        # of leaving the heading with no content under it.
        echo "No logs available"
        return 0
    fi

    compose_dir=$(dirname "$compose_file")
    # Subshell so the cd does not change the caller's working directory.
    (cd "$compose_dir" && docker compose logs --tail=5 "$service" 2>/dev/null || echo "No logs available")
}
|
|
|
|
# =============================================================================
|
|
# CONTINUOUS MONITORING FUNCTIONS
|
|
# =============================================================================
|
|
|
|
# Run continuous monitoring
#
# Loops forever: runs the health, system-resource, Docker-resource and log
# checks, stamps .last_check in $MONITOR_STATE_FILE (when jq is available),
# then sleeps for whatever is left of the interval.
#
# Arguments: $1 - seconds between the start of two cycles
#                 (default: $HEALTH_CHECK_INTERVAL); must be a positive integer
# Globals:   HEALTH_CHECK_INTERVAL (read), MONITOR_STATE_FILE (file rewritten)
# Returns:   1 immediately when the interval is invalid; otherwise does not
#            return (the loop has no exit condition).
run_continuous_monitoring() {
    local interval="${1:-$HEALTH_CHECK_INTERVAL}"

    # A non-numeric value evaluates to 0 in shell arithmetic, which would
    # make the sleep time non-positive and turn the loop into a busy loop.
    if ! [[ "$interval" =~ ^[0-9]+$ ]] || (( 10#$interval <= 0 )); then
        print_error "Invalid monitoring interval: '$interval' (expected a positive number of seconds)"
        return 1
    fi
    # Force base 10 so a value such as 060 is not read as octal.
    interval=$((10#$interval))

    print_info "Starting continuous monitoring (interval: ${interval}s)"
    print_info "Press Ctrl+C to stop"

    # Initialize state
    init_monitor_state

    local start_time end_time duration sleep_time

    # Main monitoring loop
    while true; do
        start_time=$(date +%s)

        # Run all checks
        check_all_services_health
        check_system_resources
        check_docker_resources
        check_all_logs

        # Update timestamp
        if command_exists "jq"; then
            jq --argjson timestamp "$(date +%s)" '.last_check = $timestamp' \
                "$MONITOR_STATE_FILE" > "${MONITOR_STATE_FILE}.tmp" \
                && mv "${MONITOR_STATE_FILE}.tmp" "$MONITOR_STATE_FILE"
        fi

        end_time=$(date +%s)
        duration=$((end_time - start_time))

        # A cycle that took longer than the interval starts the next one
        # right away; clamp so the message never shows a negative wait.
        sleep_time=$((interval - duration))
        if (( sleep_time < 0 )); then
            sleep_time=0
        fi

        print_info "Monitoring cycle completed in ${duration}s. Next check in ${sleep_time}s..."

        # Sleep for remaining time
        if (( sleep_time > 0 )); then
            sleep "$sleep_time"
        fi
    done
}
|
|
|
|
# =============================================================================
|
|
# MAIN FUNCTION
|
|
# =============================================================================
|
|
|
|
# Entry point: parse the command line, initialise the script, run one action.
#
# Options and the action may appear in any order (so `monitor watch -i 60`
# and `monitor -i 60 watch` are equivalent). The first dashboard/status/
# check/watch word is the action; one further non-option word is the service
# name. With no action the dashboard is shown. -c/--continuous selects
# `watch` when no action, or `check`, was given.
#
# Arguments: "$@" - command line, see the --help text below
# Globals:   HEALTH_CHECK_INTERVAL, SCRIPT_NAME, SCRIPT_VERSION (read)
# Exits:     0 after --help; 1 on usage errors or when Docker is unavailable.
main() {
    local action=""
    local service=""
    local interval="$HEALTH_CHECK_INTERVAL"
    local continuous=false

    # Parse command line arguments
    while [[ $# -gt 0 ]]; do
        case "$1" in
            -h|--help)
                cat << EOF
EZ-Homelab Service Monitoring

USAGE:
    monitor [OPTIONS] <ACTION> [SERVICE]

ACTIONS:
    dashboard    Show monitoring dashboard
    status       Show detailed status for a service
    check        Run all monitoring checks once
    watch        Continuous monitoring mode

OPTIONS:
    -i, --interval SEC    Monitoring interval in seconds (default: $HEALTH_CHECK_INTERVAL)
    -c, --continuous      Run in continuous mode (same as 'watch')

EXAMPLES:
    monitor dashboard          # Show monitoring dashboard
    monitor status traefik     # Show detailed status for Traefik
    monitor check              # Run all checks once
    monitor watch              # Start continuous monitoring
    monitor watch -i 60        # Continuous monitoring every 60 seconds

EOF
                exit 0
                ;;
            -i|--interval)
                # Without this check `shift 2` fails when the value is
                # missing, nothing is consumed, and the loop never ends.
                if [[ $# -lt 2 ]]; then
                    print_error "Option $1 requires a value"
                    exit 1
                fi
                if ! [[ "$2" =~ ^[0-9]+$ ]] || (( 10#$2 <= 0 )); then
                    print_error "Invalid interval: '$2' (expected a positive number of seconds)"
                    exit 1
                fi
                interval=$((10#$2))
                shift 2
                ;;
            -c|--continuous)
                continuous=true
                shift
                ;;
            -*)
                print_error "Unknown option: $1"
                exit 1
                ;;
            dashboard|status|check|watch)
                # First such word is the action; a later one is taken as the
                # service name.
                if [[ -z "$action" ]]; then
                    action="$1"
                elif [[ -z "$service" ]]; then
                    service="$1"
                else
                    print_error "Too many arguments"
                    exit 1
                fi
                shift
                ;;
            *)
                if [[ -z "$service" ]]; then
                    service="$1"
                else
                    print_error "Too many arguments"
                    exit 1
                fi
                shift
                ;;
        esac
    done

    # -c/--continuous is documented as "same as 'watch'".
    if [[ "$continuous" == true ]]; then
        if [[ -z "$action" || "$action" == "check" ]]; then
            action="watch"
        fi
    fi

    # Initialize script
    init_script "$SCRIPT_NAME" "$SCRIPT_VERSION"
    init_logging "$SCRIPT_NAME"
    init_monitor_state

    # Check prerequisites
    if ! docker_available; then
        print_error "Docker is not available"
        exit 1
    fi

    # Execute action
    case "$action" in
        dashboard)
            show_monitoring_dashboard
            ;;
        status)
            if [[ -n "$service" ]]; then
                show_detailed_status "$service"
            else
                print_error "Service name required for status action"
                exit 1
            fi
            ;;
        check)
            check_all_services_health
            check_system_resources
            check_docker_resources
            check_all_logs
            ;;
        watch)
            run_continuous_monitoring "$interval"
            ;;
        "")
            # Default action: show dashboard
            show_monitoring_dashboard
            ;;
        *)
            print_error "Unknown action: $action"
            exit 1
            ;;
    esac
}
|
|
|
|
# Run main function, forwarding the script's command-line arguments unchanged.
main "$@"