Files
EZ-Homelab/scripts/enhanced-setup/monitor.sh
Kelin f141848a10 Add EZ-Homelab Enhanced Setup System
- Complete modular bash-based setup system replacing Python TUI
- Phase 1-4 implementation: Core Infrastructure, Configuration Management, Deployment Engine, Service Orchestration & Management
- 11 production-ready scripts: preflight.sh, setup.sh, pre-deployment-wizard.sh, localize.sh, generalize.sh, validate.sh, deploy.sh, service.sh, monitor.sh, backup.sh, update.sh
- Shared libraries: common.sh (utilities), ui.sh (text interface)
- Template-based configuration system with environment variable substitution
- Comprehensive documentation: PRD, standards, and quick reference guides
- Automated backup, monitoring, and update management capabilities
- Cross-platform compatibility with robust error handling and logging
2026-01-29 19:53:36 -05:00

577 lines
18 KiB
Bash
Executable File

#!/bin/bash
# EZ-Homelab Enhanced Setup Scripts - Service Monitoring
# Real-time service monitoring and alerting
SCRIPT_NAME="monitor"
SCRIPT_VERSION="1.0.0"

# Load common library
source "$(dirname "${BASH_SOURCE[0]}")/lib/common.sh"
source "$(dirname "${BASH_SOURCE[0]}")/lib/ui.sh"

# =============================================================================
# MONITORING CONFIGURATION
# =============================================================================
# Every setting below may be overridden from the environment before
# invoking this script; the values shown are the defaults.

# Monitoring intervals (seconds)
HEALTH_CHECK_INTERVAL="${HEALTH_CHECK_INTERVAL:-30}"
RESOURCE_CHECK_INTERVAL="${RESOURCE_CHECK_INTERVAL:-60}"
LOG_CHECK_INTERVAL="${LOG_CHECK_INTERVAL:-300}"

# Alert thresholds (percent of CPU / memory / disk usage)
CPU_THRESHOLD="${CPU_THRESHOLD:-80}"
MEMORY_THRESHOLD="${MEMORY_THRESHOLD:-80}"
DISK_THRESHOLD="${DISK_THRESHOLD:-90}"

# Alert cooldown (seconds) - prevent alert spam
ALERT_COOLDOWN="${ALERT_COOLDOWN:-300}"

# Monitoring state file (JSON). LOG_DIR is presumably set by
# lib/common.sh — TODO confirm.
MONITOR_STATE_FILE="${MONITOR_STATE_FILE:-$LOG_DIR/monitor_state.json}"
# =============================================================================
# MONITORING STATE MANAGEMENT
# =============================================================================
# Initialize monitoring state.
# Creates the JSON state file (empty services/alerts/system_stats maps
# plus a "last_check" epoch timestamp) if it does not exist. The parent
# directory is created on demand so a fresh install cannot fail because
# the log directory is missing yet.
init_monitor_state() {
    if [[ ! -f "$MONITOR_STATE_FILE" ]]; then
        # Ensure the directory holding the state file exists.
        mkdir -p "$(dirname "$MONITOR_STATE_FILE")"
        cat > "$MONITOR_STATE_FILE" << EOF
{
    "services": {},
    "alerts": {},
    "last_check": $(date +%s),
    "system_stats": {}
}
EOF
    fi
}
# Record the latest known status of a service in the state file.
# Arguments:
#   $1 - service name
#   $2 - status string (e.g. "healthy", "down", "starting")
# With jq available the JSON file is rewritten via a temp file; without
# jq only a warning is logged (basic tracking fallback).
update_service_state() {
    local svc="$1"
    local state="$2"
    local now
    now=$(date +%s)

    if ! command_exists "jq"; then
        # Simple fallback without jq
        log_warn "jq not available, using basic state tracking"
        return
    fi

    local tmp_file="${MONITOR_STATE_FILE}.tmp"
    jq --arg service "$svc" --arg status "$state" --argjson timestamp "$now" \
        '.services[$service] = {"status": $status, "last_update": $timestamp}' \
        "$MONITOR_STATE_FILE" > "$tmp_file" && mv "$tmp_file" "$MONITOR_STATE_FILE"
}
# Decide whether the alert identified by $1 may fire, enforcing the
# ALERT_COOLDOWN window between repeats of the same alert key.
# Returns 0 (send the alert, timestamp recorded) or 1 (suppressed).
# Without jq there is no persisted alert history, so every alert fires.
should_alert() {
    local key="$1"
    local now
    now=$(date +%s)

    if ! command_exists "jq"; then
        # Without jq, always alert (no cooldown)
        return 0
    fi

    local last
    last=$(jq -r ".alerts[\"$key\"] // 0" "$MONITOR_STATE_FILE")
    if (( now - last < ALERT_COOLDOWN )); then
        return 1
    fi

    # Record this alert's timestamp for future cooldown checks.
    jq --arg alert_key "$key" --argjson timestamp "$now" \
        '.alerts[$alert_key] = $timestamp' \
        "$MONITOR_STATE_FILE" > "${MONITOR_STATE_FILE}.tmp" && mv "${MONITOR_STATE_FILE}.tmp" "$MONITOR_STATE_FILE"
    return 0
}
# =============================================================================
# HEALTH MONITORING FUNCTIONS
# =============================================================================
# Evaluate a single service's health and persist the result.
# Arguments:
#   $1 - service/container name
# Returns 0 when running and not unhealthy; 1 when down or unhealthy.
# Down/unhealthy states emit an alert, subject to the cooldown window.
check_service_health() {
    local svc="$1"

    # A stopped container is an immediate "down" result.
    if ! is_service_running "$svc"; then
        if should_alert "service_down_$svc"; then
            print_error "ALERT: Service '$svc' is down"
            log_error "Service '$svc' is down"
        fi
        update_service_state "$svc" "down"
        return 1
    fi

    # Docker health-check status; falls back to "unknown" when the
    # inspect call fails (e.g. no HEALTHCHECK configured — TODO confirm).
    local health
    health=$(docker inspect "$svc" --format '{{.State.Health.Status}}' 2>/dev/null || echo "unknown")

    if [[ "$health" == "unhealthy" ]]; then
        if should_alert "service_unhealthy_$svc"; then
            print_warning "ALERT: Service '$svc' is unhealthy"
            log_warn "Service '$svc' is unhealthy"
        fi
        update_service_state "$svc" "unhealthy"
        return 1
    fi

    if [[ "$health" == "healthy" || "$health" == "starting" ]]; then
        update_service_state "$svc" "$health"
    else
        update_service_state "$svc" "unknown"
    fi
    return 0
}
# Run a health check across every known service and print a summary
# (all healthy vs. N service(s) with issues).
check_all_services_health() {
    print_info "Checking service health..."

    local services=()
    mapfile -t services < <(find_all_services)

    local issues=0
    local svc
    for svc in "${services[@]}"; do
        check_service_health "$svc" || issues=$((issues + 1))
    done

    if (( issues == 0 )); then
        print_success "All services are healthy"
    else
        print_warning "$issues service(s) have issues"
    fi
}
# =============================================================================
# RESOURCE MONITORING FUNCTIONS
# =============================================================================
# Sample host CPU, memory, and disk usage, alert on any threshold
# breach (subject to cooldown), and print a one-line summary.
# Percentages are rounded to integers.
check_system_resources() {
    print_info "Checking system resources..."

    # CPU: busy% = 100 - idle%, with idle parsed from `top` batch output.
    local cpu
    cpu=$(top -bn1 | grep "Cpu(s)" | sed "s/.*, *\([0-9.]*\)%* id.*/\1/" | awk '{print 100 - $1}')
    cpu=$(printf "%.0f" "$cpu")
    if (( cpu > CPU_THRESHOLD )) && should_alert "high_cpu"; then
        print_error "ALERT: High CPU usage: ${cpu}% (threshold: ${CPU_THRESHOLD}%)"
        log_error "High CPU usage: ${cpu}%"
    fi

    # Memory: used/total ratio from `free`.
    local mem
    mem=$(free | grep Mem | awk '{printf "%.0f", $3/$2 * 100.0}')
    if (( mem > MEMORY_THRESHOLD )) && should_alert "high_memory"; then
        print_error "ALERT: High memory usage: ${mem}% (threshold: ${MEMORY_THRESHOLD}%)"
        log_error "High memory usage: ${mem}%"
    fi

    # Disk: root filesystem usage percentage from `df`.
    local disk
    disk=$(df / | tail -1 | awk '{print $5}' | sed 's/%//')
    if (( disk > DISK_THRESHOLD )) && should_alert "high_disk"; then
        print_error "ALERT: High disk usage: ${disk}% (threshold: ${DISK_THRESHOLD}%)"
        log_error "High disk usage: ${disk}%"
    fi

    print_info "CPU: ${cpu}%, Memory: ${mem}%, Disk: ${disk}%"
}
# Inspect per-container CPU and memory usage via `docker stats` and
# alert (subject to cooldown) on containers exceeding the configured
# thresholds. No-op when Docker is not available.
check_docker_resources() {
    print_info "Checking Docker resources..."

    if ! command_exists "docker" || ! docker_available; then
        return 0
    fi

    local containers=()
    mapfile -t containers < <(docker ps --format "{{.Names}}")

    local name row cpu mem
    for name in "${containers[@]}"; do
        # One stats line per container; `tail -n 1` strips the header row.
        row=$(docker stats --no-stream --format "table {{.Container}}\t{{.CPUPerc}}\t{{.MemPerc}}" "$name" 2>/dev/null | tail -n 1)
        [[ -z "$row" ]] && continue

        cpu=$(echo "$row" | awk '{print $2}' | sed 's/%//')
        mem=$(echo "$row" | awk '{print $3}' | sed 's/%//')
        # Truncate decimals so integer comparison works.
        cpu=${cpu%.*}
        mem=${mem%.*}

        if [[ "$cpu" =~ ^[0-9]+$ ]] && (( cpu > CPU_THRESHOLD )) && should_alert "container_high_cpu_$name"; then
            print_warning "ALERT: Container '$name' high CPU: ${cpu}%"
            log_warn "Container '$name' high CPU: ${cpu}%"
        fi
        if [[ "$mem" =~ ^[0-9]+$ ]] && (( mem > MEMORY_THRESHOLD )) && should_alert "container_high_memory_$name"; then
            print_warning "ALERT: Container '$name' high memory: ${mem}%"
            log_warn "Container '$name' high memory: ${mem}%"
        fi
    done
}
# =============================================================================
# LOG MONITORING FUNCTIONS
# =============================================================================
# Scan a service's recent logs for common error patterns and alert
# (subject to cooldown) when any are found.
# Arguments:
#   $1 - service name
#   $2 - time window accepted by `docker compose logs --since` (default: 1m)
# Returns 0 normally; 1 when the service's compose file cannot be found.
check_service_logs() {
    local service="$1"
    local since="${2:-1m}" # Default to last minute

    # Nothing to scan for a stopped service.
    if ! is_service_running "$service"; then
        return 0
    fi

    local compose_file
    compose_file=$(get_service_compose_file "$service")
    if [[ -z "$compose_file" ]]; then
        return 1
    fi

    # Split declaration from assignment so a dirname failure isn't masked.
    # (Removed the unused `compose_base` local.)
    local compose_dir
    compose_dir=$(dirname "$compose_file")

    # Patterns are matched case-sensitively; both cases listed where common.
    local error_patterns=("ERROR" "error" "Exception" "failed" "Failed" "panic" "PANIC")
    local errors_found=()
    local pattern error_count
    for pattern in "${error_patterns[@]}"; do
        # `|| true` keeps a "no match" grep (exit status 1) from aborting.
        error_count=$(cd "$compose_dir" && docker compose logs --since="$since" "$service" 2>&1 | grep -c "$pattern" || true)
        # Guard against a non-numeric/empty count (e.g. when `cd` fails),
        # which previously triggered an arithmetic syntax error.
        if [[ "$error_count" =~ ^[0-9]+$ ]] && (( error_count > 0 )); then
            errors_found+=("$pattern: $error_count")
        fi
    done

    if [[ ${#errors_found[@]} -gt 0 ]]; then
        if should_alert "log_errors_$service"; then
            print_warning "ALERT: Service '$service' has errors in logs: ${errors_found[*]}"
            log_warn "Service '$service' log errors: ${errors_found[*]}"
        fi
    fi
}
# Sweep every known service's recent logs for error patterns.
check_all_logs() {
    print_info "Checking service logs for errors..."

    local all=()
    mapfile -t all < <(find_all_services)

    local svc
    for svc in "${all[@]}"; do
        check_service_logs "$svc"
    done
}
# =============================================================================
# MONITORING DISPLAY FUNCTIONS
# =============================================================================
# Render a one-shot text dashboard: host resource usage, service status
# counts, and alert history from the last hour (alert history needs jq).
show_monitoring_dashboard() {
    print_info "EZ-Homelab Monitoring Dashboard"
    echo

    # --- host resources --------------------------------------------------
    echo "=== System Resources ==="
    local cpu mem disk
    cpu=$(top -bn1 | grep "Cpu(s)" | sed "s/.*, *\([0-9.]*\)%* id.*/\1/" | awk '{print 100 - $1}' || echo "0")
    mem=$(free | grep Mem | awk '{printf "%.0f", $3/$2 * 100.0}' || echo "0")
    disk=$(df / | tail -1 | awk '{print $5}' | sed 's/%//' || echo "0")
    echo "CPU Usage: ${cpu}%"
    echo "Memory Usage: ${mem}%"
    echo "Disk Usage: ${disk}%"
    echo

    # --- service status summary ------------------------------------------
    echo "=== Service Status ==="
    local services=()
    mapfile -t services < <(find_all_services)
    local running=0
    local unhealthy=0
    local svc state
    for svc in "${services[@]}"; do
        if is_service_running "$svc"; then
            running=$((running + 1))
            state=$(docker inspect "$svc" --format '{{.State.Health.Status}}' 2>/dev/null || echo "unknown")
            if [[ "$state" == "unhealthy" ]]; then
                unhealthy=$((unhealthy + 1))
            fi
        fi
    done
    echo "Total Services: ${#services[@]}"
    echo "Running: $running"
    echo "Unhealthy: $unhealthy"
    echo

    # --- alert history (last hour) ---------------------------------------
    echo "=== Recent Alerts ==="
    if command_exists "jq" && [[ -f "$MONITOR_STATE_FILE" ]]; then
        local recent
        recent=$(jq -r '.alerts | to_entries[] | select(.value > (now - 3600)) | "\(.key): \(.value | strftime("%H:%M:%S"))"' "$MONITOR_STATE_FILE" 2>/dev/null || echo "")
        if [[ -n "$recent" ]]; then
            echo "$recent"
        else
            echo "No recent alerts (last hour)"
        fi
    else
        echo "Alert history not available (jq not installed)"
    fi
}
# Print a detailed status report for one service: run state, container
# details, health, resource usage, and the last few log lines.
# Arguments:
#   $1 - service name (required)
# Returns 1 when no service name is given, 0 otherwise.
show_detailed_status() {
    local service="$1"
    if [[ -z "$service" ]]; then
        print_error "Service name required"
        return 1
    fi

    print_info "Detailed Status for: $service"
    echo

    if ! is_service_running "$service"; then
        echo "Status: ❌ Stopped"
        return 0
    fi
    echo "Status: ✅ Running"

    # Container details (image / status / ports), header row stripped.
    local container_info
    container_info=$(docker ps --filter "name=^${service}$" --format "table {{.Image}}\t{{.Status}}\t{{.Ports}}" | tail -n +2)
    if [[ -n "$container_info" ]]; then
        echo "Container: $container_info"
    fi

    # Docker health-check status; "N/A" when inspect fails.
    local health_status
    health_status=$(docker inspect "$service" --format '{{.State.Health.Status}}' 2>/dev/null || echo "N/A")
    echo "Health: $health_status"

    # Live resource usage snapshot, header row stripped.
    local stats
    stats=$(docker stats --no-stream --format "table {{.CPUPerc}}\t{{.MemPerc}}\t{{.NetIO}}\t{{.BlockIO}}" "$service" 2>/dev/null | tail -n +2)
    if [[ -n "$stats" ]]; then
        echo "Resources: $stats"
    fi

    # Recent logs via docker compose, run from the compose file's directory.
    echo
    echo "Recent Logs:"
    local compose_file
    compose_file=$(get_service_compose_file "$service")
    if [[ -n "$compose_file" ]]; then
        # Split declaration from assignment (exit status no longer masked);
        # removed the unused `compose_base` local.
        local compose_dir
        compose_dir=$(dirname "$compose_file")
        (cd "$compose_dir" && docker compose logs --tail=5 "$service" 2>/dev/null || echo "No logs available")
    fi
}
# =============================================================================
# CONTINUOUS MONITORING FUNCTIONS
# =============================================================================
# Run the full check suite in a loop until interrupted (Ctrl+C).
# Arguments:
#   $1 - seconds between cycle starts (default: HEALTH_CHECK_INTERVAL)
# The time a cycle took is subtracted from the interval so cycles keep
# a fixed cadence whenever checks finish within the interval.
run_continuous_monitoring() {
    local interval="${1:-$HEALTH_CHECK_INTERVAL}"

    print_info "Starting continuous monitoring (interval: ${interval}s)"
    print_info "Press Ctrl+C to stop"

    # Make sure the state file exists before the first cycle.
    init_monitor_state

    while true; do
        local cycle_start cycle_end elapsed remaining
        cycle_start=$(date +%s)

        # Full check suite: health, host resources, containers, logs.
        check_all_services_health
        check_system_resources
        check_docker_resources
        check_all_logs

        # Persist the time of this cycle when jq is available.
        if command_exists "jq"; then
            jq --argjson timestamp "$(date +%s)" '.last_check = $timestamp' \
                "$MONITOR_STATE_FILE" > "${MONITOR_STATE_FILE}.tmp" && mv "${MONITOR_STATE_FILE}.tmp" "$MONITOR_STATE_FILE"
        fi

        cycle_end=$(date +%s)
        elapsed=$((cycle_end - cycle_start))
        remaining=$((interval - elapsed))
        print_info "Monitoring cycle completed in ${elapsed}s. Next check in ${remaining}s..."

        # Sleep only when the cycle finished ahead of schedule.
        if (( remaining > 0 )); then
            sleep "$remaining"
        fi
    done
}
# =============================================================================
# MAIN FUNCTION
# =============================================================================
# Entry point: parses options and the action keyword, initializes
# logging/state, verifies Docker availability, then dispatches to the
# requested monitoring action.
#
# Actions: dashboard (default) | status <SERVICE> | check | watch
# Options: -i/--interval SEC, -c/--continuous, -h/--help
main() {
local action=""
local service=""
local interval="$HEALTH_CHECK_INTERVAL"
# NOTE(review): 'continuous' is set by -c/--continuous but never read
# afterwards; the flag currently has no effect ('watch' is the
# continuous mode). Confirm whether -c should imply action=watch.
local continuous=false
# Parse command line arguments
while [[ $# -gt 0 ]]; do
case $1 in
-h|--help)
cat << EOF
EZ-Homelab Service Monitoring
USAGE:
monitor [OPTIONS] <ACTION> [SERVICE]
ACTIONS:
dashboard Show monitoring dashboard
status Show detailed status for a service
check Run all monitoring checks once
watch Continuous monitoring mode
OPTIONS:
-i, --interval SEC Monitoring interval in seconds (default: $HEALTH_CHECK_INTERVAL)
-c, --continuous Run in continuous mode (same as 'watch')
EXAMPLES:
monitor dashboard # Show monitoring dashboard
monitor status traefik # Show detailed status for Traefik
monitor check # Run all checks once
monitor watch # Start continuous monitoring
monitor watch -i 60 # Continuous monitoring every 60 seconds
EOF
exit 0
;;
-i|--interval)
interval="$2"
shift 2
;;
-c|--continuous)
continuous=true
shift
;;
# The first recognized action keyword ends option parsing ('break');
# anything after it is handled by the second loop below.
dashboard|status|check|watch)
action="$1"
shift
break
;;
# Any other word seen before the action keyword is taken to be the
# service name; a second such word is an error.
*)
if [[ -z "$service" ]]; then
service="$1"
else
print_error "Too many arguments"
exit 1
fi
shift
;;
esac
done
# Handle remaining arguments
# (everything after the action keyword; at most one service name total)
while [[ $# -gt 0 ]]; do
if [[ -z "$service" ]]; then
service="$1"
else
print_error "Too many arguments"
exit 1
fi
shift
done
# Initialize script
init_script "$SCRIPT_NAME" "$SCRIPT_VERSION"
init_logging "$SCRIPT_NAME"
init_monitor_state
# Check prerequisites
if ! docker_available; then
print_error "Docker is not available"
exit 1
fi
# Execute action
case "$action" in
dashboard)
show_monitoring_dashboard
;;
status)
# 'status' requires an explicit service name.
if [[ -n "$service" ]]; then
show_detailed_status "$service"
else
print_error "Service name required for status action"
exit 1
fi
;;
check)
# One-shot run of the full check suite.
check_all_services_health
check_system_resources
check_docker_resources
check_all_logs
;;
watch)
run_continuous_monitoring "$interval"
;;
"")
# Default action: show dashboard
show_monitoring_dashboard
;;
# Defensive default: unreachable via the parser above (unknown words
# are consumed as the service name), but kept as a safety net.
*)
print_error "Unknown action: $action"
exit 1
;;
esac
}
# Run main function
main "$@"