Files
EZ-Homelab/scripts/enhanced-setup/monitor.sh
Kelin f141848a10 Add EZ-Homelab Enhanced Setup System
- Complete modular bash-based setup system replacing Python TUI
- Phase 1-4 implementation: Core Infrastructure, Configuration Management, Deployment Engine, Service Orchestration & Management
- 11 production-ready scripts: preflight.sh, setup.sh, pre-deployment-wizard.sh, localize.sh, generalize.sh, validate.sh, deploy.sh, service.sh, monitor.sh, backup.sh, update.sh
- Shared libraries: common.sh (utilities), ui.sh (text interface)
- Template-based configuration system with environment variable substitution
- Comprehensive documentation: PRD, standards, and quick reference guides
- Automated backup, monitoring, and update management capabilities
- Cross-platform compatibility with robust error handling and logging
2026-01-29 19:53:36 -05:00

577 lines
18 KiB
Bash
Executable File

#!/bin/bash
# EZ-Homelab Enhanced Setup Scripts - Service Monitoring
# Real-time service monitoring and alerting
SCRIPT_NAME="monitor"
SCRIPT_VERSION="1.0.0"

# Load common library
source "$(dirname "${BASH_SOURCE[0]}")/lib/common.sh"
source "$(dirname "${BASH_SOURCE[0]}")/lib/ui.sh"

# =============================================================================
# MONITORING CONFIGURATION
# =============================================================================
# Every setting below may be overridden from the environment before
# invoking this script; the values shown are the defaults.

# Monitoring intervals (seconds)
HEALTH_CHECK_INTERVAL="${HEALTH_CHECK_INTERVAL:-30}"
RESOURCE_CHECK_INTERVAL="${RESOURCE_CHECK_INTERVAL:-60}"
LOG_CHECK_INTERVAL="${LOG_CHECK_INTERVAL:-300}"

# Alert thresholds (percent of CPU / memory / disk usage)
CPU_THRESHOLD="${CPU_THRESHOLD:-80}"
MEMORY_THRESHOLD="${MEMORY_THRESHOLD:-80}"
DISK_THRESHOLD="${DISK_THRESHOLD:-90}"

# Alert cooldown (seconds) - prevent alert spam
ALERT_COOLDOWN="${ALERT_COOLDOWN:-300}"

# Monitoring state file (JSON). LOG_DIR is presumably set by
# lib/common.sh — TODO confirm.
MONITOR_STATE_FILE="${MONITOR_STATE_FILE:-$LOG_DIR/monitor_state.json}"
# =============================================================================
# MONITORING STATE MANAGEMENT
# =============================================================================
# Initialize monitoring state.
# Creates the JSON state file (empty services/alerts/system_stats maps
# plus a "last_check" epoch timestamp) if it does not exist. The parent
# directory is created on demand so a fresh install cannot fail because
# the log directory is missing yet.
init_monitor_state() {
    if [[ ! -f "$MONITOR_STATE_FILE" ]]; then
        # Ensure the directory holding the state file exists.
        mkdir -p "$(dirname "$MONITOR_STATE_FILE")"
        cat > "$MONITOR_STATE_FILE" << EOF
{
    "services": {},
    "alerts": {},
    "last_check": $(date +%s),
    "system_stats": {}
}
EOF
    fi
}
# Record the latest known status of a service in the state file.
# Arguments:
#   $1 - service name
#   $2 - status string (e.g. "healthy", "down", "starting")
# With jq available the JSON file is rewritten via a temp file; without
# jq only a warning is logged (basic tracking fallback).
update_service_state() {
    local svc="$1"
    local state="$2"
    local now
    now=$(date +%s)

    if ! command_exists "jq"; then
        # Simple fallback without jq
        log_warn "jq not available, using basic state tracking"
        return
    fi

    local tmp_file="${MONITOR_STATE_FILE}.tmp"
    jq --arg service "$svc" --arg status "$state" --argjson timestamp "$now" \
        '.services[$service] = {"status": $status, "last_update": $timestamp}' \
        "$MONITOR_STATE_FILE" > "$tmp_file" && mv "$tmp_file" "$MONITOR_STATE_FILE"
}
# Decide whether the alert identified by $1 may fire, enforcing the
# ALERT_COOLDOWN window between repeats of the same alert key.
# Returns 0 (send the alert, timestamp recorded) or 1 (suppressed).
# Without jq there is no persisted alert history, so every alert fires.
should_alert() {
    local key="$1"
    local now
    now=$(date +%s)

    if ! command_exists "jq"; then
        # Without jq, always alert (no cooldown)
        return 0
    fi

    local last
    last=$(jq -r ".alerts[\"$key\"] // 0" "$MONITOR_STATE_FILE")
    if (( now - last < ALERT_COOLDOWN )); then
        return 1
    fi

    # Record this alert's timestamp for future cooldown checks.
    jq --arg alert_key "$key" --argjson timestamp "$now" \
        '.alerts[$alert_key] = $timestamp' \
        "$MONITOR_STATE_FILE" > "${MONITOR_STATE_FILE}.tmp" && mv "${MONITOR_STATE_FILE}.tmp" "$MONITOR_STATE_FILE"
    return 0
}
# =============================================================================
# HEALTH MONITORING FUNCTIONS
# =============================================================================
# Evaluate a single service's health and persist the result.
# Arguments:
#   $1 - service/container name
# Returns 0 when running and not unhealthy; 1 when down or unhealthy.
# Down/unhealthy states emit an alert, subject to the cooldown window.
check_service_health() {
    local svc="$1"

    # A stopped container is an immediate "down" result.
    if ! is_service_running "$svc"; then
        if should_alert "service_down_$svc"; then
            print_error "ALERT: Service '$svc' is down"
            log_error "Service '$svc' is down"
        fi
        update_service_state "$svc" "down"
        return 1
    fi

    # Docker health-check status; falls back to "unknown" when the
    # inspect call fails (e.g. no HEALTHCHECK configured — TODO confirm).
    local health
    health=$(docker inspect "$svc" --format '{{.State.Health.Status}}' 2>/dev/null || echo "unknown")

    if [[ "$health" == "unhealthy" ]]; then
        if should_alert "service_unhealthy_$svc"; then
            print_warning "ALERT: Service '$svc' is unhealthy"
            log_warn "Service '$svc' is unhealthy"
        fi
        update_service_state "$svc" "unhealthy"
        return 1
    fi

    if [[ "$health" == "healthy" || "$health" == "starting" ]]; then
        update_service_state "$svc" "$health"
    else
        update_service_state "$svc" "unknown"
    fi
    return 0
}
# Run a health check across every known service and print a summary
# (all healthy vs. N service(s) with issues).
check_all_services_health() {
    print_info "Checking service health..."

    local services=()
    mapfile -t services < <(find_all_services)

    local issues=0
    local svc
    for svc in "${services[@]}"; do
        check_service_health "$svc" || issues=$((issues + 1))
    done

    if (( issues == 0 )); then
        print_success "All services are healthy"
    else
        print_warning "$issues service(s) have issues"
    fi
}
# =============================================================================
# RESOURCE MONITORING FUNCTIONS
# =============================================================================
# Sample host CPU, memory, and disk usage, alert on any threshold
# breach (subject to cooldown), and print a one-line summary.
# Percentages are rounded to integers.
check_system_resources() {
    print_info "Checking system resources..."

    # CPU: busy% = 100 - idle%, with idle parsed from `top` batch output.
    local cpu
    cpu=$(top -bn1 | grep "Cpu(s)" | sed "s/.*, *\([0-9.]*\)%* id.*/\1/" | awk '{print 100 - $1}')
    cpu=$(printf "%.0f" "$cpu")
    if (( cpu > CPU_THRESHOLD )) && should_alert "high_cpu"; then
        print_error "ALERT: High CPU usage: ${cpu}% (threshold: ${CPU_THRESHOLD}%)"
        log_error "High CPU usage: ${cpu}%"
    fi

    # Memory: used/total ratio from `free`.
    local mem
    mem=$(free | grep Mem | awk '{printf "%.0f", $3/$2 * 100.0}')
    if (( mem > MEMORY_THRESHOLD )) && should_alert "high_memory"; then
        print_error "ALERT: High memory usage: ${mem}% (threshold: ${MEMORY_THRESHOLD}%)"
        log_error "High memory usage: ${mem}%"
    fi

    # Disk: root filesystem usage percentage from `df`.
    local disk
    disk=$(df / | tail -1 | awk '{print $5}' | sed 's/%//')
    if (( disk > DISK_THRESHOLD )) && should_alert "high_disk"; then
        print_error "ALERT: High disk usage: ${disk}% (threshold: ${DISK_THRESHOLD}%)"
        log_error "High disk usage: ${disk}%"
    fi

    print_info "CPU: ${cpu}%, Memory: ${mem}%, Disk: ${disk}%"
}
# Inspect per-container CPU and memory usage via `docker stats` and
# alert (subject to cooldown) on containers exceeding the configured
# thresholds. No-op when Docker is not available.
check_docker_resources() {
    print_info "Checking Docker resources..."

    if ! command_exists "docker" || ! docker_available; then
        return 0
    fi

    local containers=()
    mapfile -t containers < <(docker ps --format "{{.Names}}")

    local name row cpu mem
    for name in "${containers[@]}"; do
        # One stats line per container; `tail -n 1` strips the header row.
        row=$(docker stats --no-stream --format "table {{.Container}}\t{{.CPUPerc}}\t{{.MemPerc}}" "$name" 2>/dev/null | tail -n 1)
        [[ -z "$row" ]] && continue

        cpu=$(echo "$row" | awk '{print $2}' | sed 's/%//')
        mem=$(echo "$row" | awk '{print $3}' | sed 's/%//')
        # Truncate decimals so integer comparison works.
        cpu=${cpu%.*}
        mem=${mem%.*}

        if [[ "$cpu" =~ ^[0-9]+$ ]] && (( cpu > CPU_THRESHOLD )) && should_alert "container_high_cpu_$name"; then
            print_warning "ALERT: Container '$name' high CPU: ${cpu}%"
            log_warn "Container '$name' high CPU: ${cpu}%"
        fi
        if [[ "$mem" =~ ^[0-9]+$ ]] && (( mem > MEMORY_THRESHOLD )) && should_alert "container_high_memory_$name"; then
            print_warning "ALERT: Container '$name' high memory: ${mem}%"
            log_warn "Container '$name' high memory: ${mem}%"
        fi
    done
}
# =============================================================================
# LOG MONITORING FUNCTIONS
# =============================================================================
# Scan a service's recent logs for common error patterns and alert
# (subject to cooldown) when any are found.
# Arguments:
#   $1 - service name
#   $2 - time window accepted by `docker compose logs --since` (default: 1m)
# Returns 0 normally; 1 when the service's compose file cannot be found.
check_service_logs() {
    local service="$1"
    local since="${2:-1m}" # Default to last minute

    # Nothing to scan for a stopped service.
    if ! is_service_running "$service"; then
        return 0
    fi

    local compose_file
    compose_file=$(get_service_compose_file "$service")
    if [[ -z "$compose_file" ]]; then
        return 1
    fi

    # Split declaration from assignment so a dirname failure isn't masked.
    # (Removed the unused `compose_base` local.)
    local compose_dir
    compose_dir=$(dirname "$compose_file")

    # Patterns are matched case-sensitively; both cases listed where common.
    local error_patterns=("ERROR" "error" "Exception" "failed" "Failed" "panic" "PANIC")
    local errors_found=()
    local pattern error_count
    for pattern in "${error_patterns[@]}"; do
        # `|| true` keeps a "no match" grep (exit status 1) from aborting.
        error_count=$(cd "$compose_dir" && docker compose logs --since="$since" "$service" 2>&1 | grep -c "$pattern" || true)
        # Guard against a non-numeric/empty count (e.g. when `cd` fails),
        # which previously triggered an arithmetic syntax error.
        if [[ "$error_count" =~ ^[0-9]+$ ]] && (( error_count > 0 )); then
            errors_found+=("$pattern: $error_count")
        fi
    done

    if [[ ${#errors_found[@]} -gt 0 ]]; then
        if should_alert "log_errors_$service"; then
            print_warning "ALERT: Service '$service' has errors in logs: ${errors_found[*]}"
            log_warn "Service '$service' log errors: ${errors_found[*]}"
        fi
    fi
}
# Sweep every known service's recent logs for error patterns.
check_all_logs() {
    print_info "Checking service logs for errors..."

    local all=()
    mapfile -t all < <(find_all_services)

    local svc
    for svc in "${all[@]}"; do
        check_service_logs "$svc"
    done
}
# =============================================================================
# MONITORING DISPLAY FUNCTIONS
# =============================================================================
# Render a one-shot text dashboard: host resource usage, service status
# counts, and alert history from the last hour (alert history needs jq).
show_monitoring_dashboard() {
    print_info "EZ-Homelab Monitoring Dashboard"
    echo

    # --- host resources --------------------------------------------------
    echo "=== System Resources ==="
    local cpu mem disk
    cpu=$(top -bn1 | grep "Cpu(s)" | sed "s/.*, *\([0-9.]*\)%* id.*/\1/" | awk '{print 100 - $1}' || echo "0")
    mem=$(free | grep Mem | awk '{printf "%.0f", $3/$2 * 100.0}' || echo "0")
    disk=$(df / | tail -1 | awk '{print $5}' | sed 's/%//' || echo "0")
    echo "CPU Usage: ${cpu}%"
    echo "Memory Usage: ${mem}%"
    echo "Disk Usage: ${disk}%"
    echo

    # --- service status summary ------------------------------------------
    echo "=== Service Status ==="
    local services=()
    mapfile -t services < <(find_all_services)
    local running=0
    local unhealthy=0
    local svc state
    for svc in "${services[@]}"; do
        if is_service_running "$svc"; then
            running=$((running + 1))
            state=$(docker inspect "$svc" --format '{{.State.Health.Status}}' 2>/dev/null || echo "unknown")
            if [[ "$state" == "unhealthy" ]]; then
                unhealthy=$((unhealthy + 1))
            fi
        fi
    done
    echo "Total Services: ${#services[@]}"
    echo "Running: $running"
    echo "Unhealthy: $unhealthy"
    echo

    # --- alert history (last hour) ---------------------------------------
    echo "=== Recent Alerts ==="
    if command_exists "jq" && [[ -f "$MONITOR_STATE_FILE" ]]; then
        local recent
        recent=$(jq -r '.alerts | to_entries[] | select(.value > (now - 3600)) | "\(.key): \(.value | strftime("%H:%M:%S"))"' "$MONITOR_STATE_FILE" 2>/dev/null || echo "")
        if [[ -n "$recent" ]]; then
            echo "$recent"
        else
            echo "No recent alerts (last hour)"
        fi
    else
        echo "Alert history not available (jq not installed)"
    fi
}
# Print a detailed status report for one service: run state, container
# details, health, resource usage, and the last few log lines.
# Arguments:
#   $1 - service name (required)
# Returns 1 when no service name is given, 0 otherwise.
show_detailed_status() {
    local service="$1"
    if [[ -z "$service" ]]; then
        print_error "Service name required"
        return 1
    fi

    print_info "Detailed Status for: $service"
    echo

    if ! is_service_running "$service"; then
        echo "Status: ❌ Stopped"
        return 0
    fi
    echo "Status: ✅ Running"

    # Container details (image / status / ports), header row stripped.
    local container_info
    container_info=$(docker ps --filter "name=^${service}$" --format "table {{.Image}}\t{{.Status}}\t{{.Ports}}" | tail -n +2)
    if [[ -n "$container_info" ]]; then
        echo "Container: $container_info"
    fi

    # Docker health-check status; "N/A" when inspect fails.
    local health_status
    health_status=$(docker inspect "$service" --format '{{.State.Health.Status}}' 2>/dev/null || echo "N/A")
    echo "Health: $health_status"

    # Live resource usage snapshot, header row stripped.
    local stats
    stats=$(docker stats --no-stream --format "table {{.CPUPerc}}\t{{.MemPerc}}\t{{.NetIO}}\t{{.BlockIO}}" "$service" 2>/dev/null | tail -n +2)
    if [[ -n "$stats" ]]; then
        echo "Resources: $stats"
    fi

    # Recent logs via docker compose, run from the compose file's directory.
    echo
    echo "Recent Logs:"
    local compose_file
    compose_file=$(get_service_compose_file "$service")
    if [[ -n "$compose_file" ]]; then
        # Split declaration from assignment (exit status no longer masked);
        # removed the unused `compose_base` local.
        local compose_dir
        compose_dir=$(dirname "$compose_file")
        (cd "$compose_dir" && docker compose logs --tail=5 "$service" 2>/dev/null || echo "No logs available")
    fi
}
# =============================================================================
# CONTINUOUS MONITORING FUNCTIONS
# =============================================================================
# Run the full check suite in a loop until interrupted (Ctrl+C).
# Arguments:
#   $1 - seconds between cycle starts (default: HEALTH_CHECK_INTERVAL)
# The time a cycle took is subtracted from the interval so cycles keep
# a fixed cadence whenever checks finish within the interval.
run_continuous_monitoring() {
    local interval="${1:-$HEALTH_CHECK_INTERVAL}"

    print_info "Starting continuous monitoring (interval: ${interval}s)"
    print_info "Press Ctrl+C to stop"

    # Make sure the state file exists before the first cycle.
    init_monitor_state

    while true; do
        local cycle_start cycle_end elapsed remaining
        cycle_start=$(date +%s)

        # Full check suite: health, host resources, containers, logs.
        check_all_services_health
        check_system_resources
        check_docker_resources
        check_all_logs

        # Persist the time of this cycle when jq is available.
        if command_exists "jq"; then
            jq --argjson timestamp "$(date +%s)" '.last_check = $timestamp' \
                "$MONITOR_STATE_FILE" > "${MONITOR_STATE_FILE}.tmp" && mv "${MONITOR_STATE_FILE}.tmp" "$MONITOR_STATE_FILE"
        fi

        cycle_end=$(date +%s)
        elapsed=$((cycle_end - cycle_start))
        remaining=$((interval - elapsed))
        print_info "Monitoring cycle completed in ${elapsed}s. Next check in ${remaining}s..."

        # Sleep only when the cycle finished ahead of schedule.
        if (( remaining > 0 )); then
            sleep "$remaining"
        fi
    done
}
# =============================================================================
# MAIN FUNCTION
# =============================================================================
# Entry point: parses options and the action keyword, initializes
# logging/state, verifies Docker availability, then dispatches to the
# requested monitoring action.
#
# Actions: dashboard (default) | status <SERVICE> | check | watch
# Options: -i/--interval SEC, -c/--continuous, -h/--help
main() {
local action=""
local service=""
local interval="$HEALTH_CHECK_INTERVAL"
# NOTE(review): 'continuous' is set by -c/--continuous but never read
# afterwards; the flag currently has no effect ('watch' is the
# continuous mode). Confirm whether -c should imply action=watch.
local continuous=false
# Parse command line arguments
while [[ $# -gt 0 ]]; do
case $1 in
-h|--help)
cat << EOF
EZ-Homelab Service Monitoring
USAGE:
monitor [OPTIONS] <ACTION> [SERVICE]
ACTIONS:
dashboard Show monitoring dashboard
status Show detailed status for a service
check Run all monitoring checks once
watch Continuous monitoring mode
OPTIONS:
-i, --interval SEC Monitoring interval in seconds (default: $HEALTH_CHECK_INTERVAL)
-c, --continuous Run in continuous mode (same as 'watch')
EXAMPLES:
monitor dashboard # Show monitoring dashboard
monitor status traefik # Show detailed status for Traefik
monitor check # Run all checks once
monitor watch # Start continuous monitoring
monitor watch -i 60 # Continuous monitoring every 60 seconds
EOF
exit 0
;;
-i|--interval)
interval="$2"
shift 2
;;
-c|--continuous)
continuous=true
shift
;;
# The first recognized action keyword ends option parsing ('break');
# anything after it is handled by the second loop below.
dashboard|status|check|watch)
action="$1"
shift
break
;;
# Any other word seen before the action keyword is taken to be the
# service name; a second such word is an error.
*)
if [[ -z "$service" ]]; then
service="$1"
else
print_error "Too many arguments"
exit 1
fi
shift
;;
esac
done
# Handle remaining arguments
# (everything after the action keyword; at most one service name total)
while [[ $# -gt 0 ]]; do
if [[ -z "$service" ]]; then
service="$1"
else
print_error "Too many arguments"
exit 1
fi
shift
done
# Initialize script
init_script "$SCRIPT_NAME" "$SCRIPT_VERSION"
init_logging "$SCRIPT_NAME"
init_monitor_state
# Check prerequisites
if ! docker_available; then
print_error "Docker is not available"
exit 1
fi
# Execute action
case "$action" in
dashboard)
show_monitoring_dashboard
;;
status)
# 'status' requires an explicit service name.
if [[ -n "$service" ]]; then
show_detailed_status "$service"
else
print_error "Service name required for status action"
exit 1
fi
;;
check)
# One-shot run of the full check suite.
check_all_services_health
check_system_resources
check_docker_resources
check_all_logs
;;
watch)
run_continuous_monitoring "$interval"
;;
"")
# Default action: show dashboard
show_monitoring_dashboard
;;
# Defensive default: unreachable via the parser above (unknown words
# are consumed as the service name), but kept as a safety net.
*)
print_error "Unknown action: $action"
exit 1
;;
esac
}
# Run main function
main "$@"