Skip to main content

Universal Health Monitor - Configuration File Based

Script - Universal Health Monitor - Configuration File Based

This is a universal solution that reads its targets from a configuration file, so users never need to touch the code.

1. Main Script: universal-health-monitor.sh
#!/bin/bash

# Universal Multi-Domain Multi-Machine Health Monitor Script
# Reads 'domain','type','id' entries from monitor-domains.txt, probes each
# domain with curl, and restarts the mapped Proxmox VM/LXC (via qm/pct) when
# the domain stops responding.

# Configuration
# File name of the domain list; main() looks for it inside SCRIPT_DIR.
CONFIG_FILE="monitor-domains.txt"
# Absolute directory that contains this script.
# CDPATH is emptied for the cd so a relative directory can never be resolved
# through CDPATH (in which case cd also prints the target and would corrupt
# the captured value); `--` stops a path starting with '-' from being parsed
# as an option.
SCRIPT_DIR="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
# Log file written by log_message(), kept next to the script.
LOG_FILE="$SCRIPT_DIR/vm-health-monitor.log"
# Seconds passed to check_url() for both connect and total request time.
TIMEOUT=10
# Number of probe attempts before a domain is declared failed.
MAX_RETRIES=3
# Seconds to wait between the stop and the start phase of a restart.
STOP_START_DELAY=5

# Write one timestamped log line to stdout and append it to $LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
log_message() {
    local stamp
    stamp="$(date '+%Y-%m-%d %H:%M:%S')"
    # tee -a duplicates the line: terminal/cron output plus the log file
    printf '%s - %s\n' "$stamp" "$1" | tee -a "$LOG_FILE"
}

# Probe a URL with curl and report whether it answered.
# Arguments:
#   $1 - domain, host:port, or full http(s):// URL
#   $2 - timeout in seconds, applied to both connect and total request time
#        (optional; defaults to 10)
# Returns: 0 when curl exits successfully, 1 otherwise.
# Note: curl is invoked without --fail, so any completed HTTP exchange,
# including one with an error status code, counts as "responding".
check_url() {
    local url="$1"
    # An empty timeout would make curl reject its arguments and report every
    # host as down, so fall back to a sane default.
    local timeout="${2:-10}"
    # URL schemes are case-insensitive; match HTTP://, Https://, ... as well
    # so an already-qualified URL is not prefixed a second time.
    local scheme_re='^[Hh][Tt][Tt][Pp][Ss]?://'

    # Add http:// if no scheme is present
    if [[ ! "$url" =~ $scheme_re ]]; then
        url="http://$url"
    fi

    # -s: silent, -k: skip TLS certificate verification, -L: follow redirects
    if curl -s -k -L --connect-timeout "$timeout" --max-time "$timeout" "$url" > /dev/null 2>&1; then
        return 0  # Success
    fi
    return 1  # Failure
}

# Report whether a Proxmox guest is currently running.
# Arguments:
#   $1 - numeric guest ID
#   $2 - guest kind: "vm" (queried with qm) or "lxc" (queried with pct)
# Returns: 0 when the status output contains "status: running", 1 otherwise
#          (including an unknown guest kind, which is also logged).
check_machine_status() {
    local machine_id="$1"
    local machine_type="$2"
    local status_tool

    # Pick the CLI that owns this kind of guest
    case "$machine_type" in
        "vm")
            status_tool="qm"
            ;;
        "lxc")
            status_tool="pct"
            ;;
        *)
            log_message "ERROR: Unknown machine type '$machine_type'"
            return 1
            ;;
    esac

    if "$status_tool" status "$machine_id" | grep -q "status: running"; then
        return 0  # Running
    fi
    return 1  # Not running
}

# Stop a Proxmox guest and wait for it to leave the running state.
# Arguments:
#   $1 - numeric guest ID
#   $2 - guest kind: "vm" (qm stop) or "lxc" (pct stop)
#   $3 - domain that triggered the stop (used in log messages only)
# Returns:
#   0 when the stop command succeeded (even if the stopped state could not be
#     confirmed within 30 seconds),
#   1 when the stop command failed or the guest kind is unknown.
stop_machine() {
    local machine_id="$1"
    local machine_type="$2"
    local domain="$3"
    local tool label
    local wait_count=0

    log_message "Attempting to stop $machine_type $machine_id for domain $domain"

    # Select CLI and log label once instead of duplicating the whole sequence
    case "$machine_type" in
        "vm")
            tool="qm"
            label="VM"
            ;;
        "lxc")
            tool="pct"
            label="LXC"
            ;;
        *)
            # Previously an unknown kind fell through and reported success
            log_message "ERROR: Unknown machine type '$machine_type'"
            return 1
            ;;
    esac

    if ! "$tool" stop "$machine_id"; then
        log_message "✗ Failed to execute stop command for $label $machine_id"
        return 1
    fi
    log_message "✓ $label $machine_id stop command executed successfully"

    # Wait and verify it stopped: poll once per second, up to 30 seconds
    while (( wait_count < 30 )); do
        if ! check_machine_status "$machine_id" "$machine_type"; then
            log_message "✓ $label $machine_id confirmed stopped"
            return 0
        fi
        sleep 1
        wait_count=$((wait_count + 1))
    done

    log_message "⚠ $label $machine_id stop command executed but status verification timed out"
    return 0  # Command succeeded even if verification timed out
}

# Start a Proxmox guest and wait for it to reach the running state.
# Arguments:
#   $1 - numeric guest ID
#   $2 - guest kind: "vm" (qm start) or "lxc" (pct start)
#   $3 - domain that triggered the start (used in log messages only)
# Returns:
#   0 when the start command succeeded (even if the running state could not
#     be confirmed within 60 seconds),
#   1 when the start command failed or the guest kind is unknown.
start_machine() {
    local machine_id="$1"
    local machine_type="$2"
    local domain="$3"
    local tool label
    local wait_count=0

    log_message "Attempting to start $machine_type $machine_id for domain $domain"

    # Select CLI and log label once instead of duplicating the whole sequence
    case "$machine_type" in
        "vm")
            tool="qm"
            label="VM"
            ;;
        "lxc")
            tool="pct"
            label="LXC"
            ;;
        *)
            # Previously an unknown kind fell through and reported success
            log_message "ERROR: Unknown machine type '$machine_type'"
            return 1
            ;;
    esac

    if ! "$tool" start "$machine_id"; then
        log_message "✗ Failed to execute start command for $label $machine_id"
        return 1
    fi
    log_message "✓ $label $machine_id start command executed successfully"

    # Wait and verify it started: poll once per second, up to 60 seconds
    while (( wait_count < 60 )); do
        if check_machine_status "$machine_id" "$machine_type"; then
            log_message "✓ $label $machine_id confirmed running"
            return 0
        fi
        sleep 1
        wait_count=$((wait_count + 1))
    done

    log_message "⚠ $label $machine_id start command executed but status verification timed out"
    return 0  # Command succeeded even if verification timed out
}

# Restart a guest: stop it, pause for $STOP_START_DELAY seconds, start it.
# Globals:   STOP_START_DELAY (read)
# Arguments:
#   $1 - numeric guest ID
#   $2 - guest kind ("vm" or "lxc")
#   $3 - domain that triggered the restart (used in log messages)
# Returns: 0 when both phases succeed, 1 as soon as either phase fails.
restart_machine() {
    local machine_id="$1"
    local machine_type="$2"
    local domain="$3"

    log_message "=== INITIATING RESTART SEQUENCE FOR $domain ($machine_type $machine_id) ==="

    # Phase 1: stop; give up immediately if the stop command fails
    if ! stop_machine "$machine_id" "$machine_type" "$domain"; then
        log_message "=== RESTART SEQUENCE FAILED AT STOP PHASE FOR $domain ==="
        return 1
    fi

    log_message "Waiting $STOP_START_DELAY seconds before starting..."
    sleep "$STOP_START_DELAY"

    # Phase 2: start
    if ! start_machine "$machine_id" "$machine_type" "$domain"; then
        log_message "=== RESTART SEQUENCE FAILED AT START PHASE FOR $domain ==="
        return 1
    fi

    log_message "=== RESTART SEQUENCE COMPLETED SUCCESSFULLY FOR $domain ==="
    return 0
}

# Parse one configuration line of the form 'domain','type','id'.
# Globals (written):
#   PARSED_DOMAIN, PARSED_TYPE, PARSED_ID - the three quoted fields on
#   success; reset to empty on every call so a failed parse never leaves the
#   previous line's values behind.
# Arguments: $1 - raw line from the configuration file
# Returns:
#   0 when the line matches the expected format,
#   1 for blank lines and comments (silently) and for malformed lines (logged).
parse_config_line() {
    local line="$1"
    # Three non-empty single-quoted fields separated by commas, nothing else
    local entry_re="^'([^']+)','([^']+)','([^']+)'\$"

    PARSED_DOMAIN=""
    PARSED_TYPE=""
    PARSED_ID=""

    # Remove leading/trailing whitespace with parameter expansion. Unlike
    # `echo "$line" | sed`, this cannot misread lines such as "-n" as echo
    # options and needs no subprocesses. A trailing carriage return from a
    # CRLF file counts as whitespace and is removed too.
    line="${line#"${line%%[![:space:]]*}"}"
    line="${line%"${line##*[![:space:]]}"}"

    # Skip empty lines and comments
    if [[ -z "$line" || "$line" == \#* ]]; then
        return 1
    fi

    if [[ "$line" =~ $entry_re ]]; then
        PARSED_DOMAIN="${BASH_REMATCH[1]}"
        PARSED_TYPE="${BASH_REMATCH[2]}"
        PARSED_ID="${BASH_REMATCH[3]}"
        return 0
    fi

    log_message "ERROR: Invalid line format: $line"
    return 1
}

# Validate every entry of the configuration file and log each problem found.
# Arguments: $1 - path to the configuration file
# Returns:
#   0 when the file exists and every non-blank, non-comment line is a valid
#     'domain','type','id' entry with type vm|lxc and a numeric id,
#   1 when the file is missing or at least one line is invalid.
validate_config_file() {
    local config_file="$1"
    local errors=0
    local line_number=0
    local line trimmed

    if [[ ! -f "$config_file" ]]; then
        log_message "ERROR: Configuration file '$config_file' not found"
        return 1
    fi

    log_message "Validating configuration file: $config_file"

    # `|| [[ -n "$line" ]]` keeps a final line that lacks a trailing newline;
    # without it such a line would silently escape validation.
    while IFS= read -r line || [[ -n "$line" ]]; do
        line_number=$((line_number + 1))

        # Skip empty lines and comments (whitespace-trimmed view of the line)
        trimmed="${line#"${line%%[![:space:]]*}"}"
        trimmed="${trimmed%"${trimmed##*[![:space:]]}"}"
        if [[ -z "$trimmed" || "$trimmed" == \#* ]]; then
            continue
        fi

        if ! parse_config_line "$line"; then
            log_message "ERROR: Invalid format at line $line_number: $line"
            errors=$((errors + 1))
            continue
        fi

        # Validate domain
        if [[ -z "$PARSED_DOMAIN" ]]; then
            log_message "ERROR: Empty domain at line $line_number"
            errors=$((errors + 1))
        fi

        # Validate type
        if [[ "$PARSED_TYPE" != "vm" && "$PARSED_TYPE" != "lxc" ]]; then
            log_message "ERROR: Invalid machine type '$PARSED_TYPE' at line $line_number (must be 'vm' or 'lxc')"
            errors=$((errors + 1))
        fi

        # Validate ID
        if [[ ! "$PARSED_ID" =~ ^[0-9]+$ ]]; then
            log_message "ERROR: Invalid machine ID '$PARSED_ID' at line $line_number (must be numeric)"
            errors=$((errors + 1))
        fi
    done < "$config_file"

    if (( errors > 0 )); then
        log_message "Configuration validation failed with $errors errors"
        return 1
    fi

    log_message "Configuration validation passed"
    return 0
}

# Probe one domain and restart its guest when every attempt fails.
# Globals:   TIMEOUT, MAX_RETRIES (read)
# Arguments:
#   $1 - domain / URL to probe
#   $2 - guest kind ("vm" or "lxc")
#   $3 - numeric guest ID
# Returns:
#   0 when the domain responded, or when it did not but the restart succeeded,
#   1 when the restart failed.
monitor_domain() {
    local domain="$1"
    local machine_type="$2"
    local machine_id="$3"
    local attempt
    local responded=false

    log_message "Monitoring $domain -> $machine_type $machine_id"

    # Up to MAX_RETRIES probes, pausing 5 seconds between failed ones
    for (( attempt = 1; attempt <= MAX_RETRIES; attempt++ )); do
        if check_url "$domain" "$TIMEOUT"; then
            log_message "✓ $domain is responding (attempt $attempt/$MAX_RETRIES)"
            responded=true
            break
        fi

        # No message or pause after the final failed attempt
        if (( attempt < MAX_RETRIES )); then
            log_message "⚠ $domain not responding (attempt $attempt/$MAX_RETRIES) - retrying in 5 seconds..."
            sleep 5
        fi
    done

    if [[ "$responded" == true ]]; then
        log_message "✓ $domain is healthy - no action needed for $machine_type $machine_id"
        return 0
    fi

    log_message "✗ $domain failed after $MAX_RETRIES attempts"
    log_message "Initiating restart for $machine_type $machine_id due to $domain failure"

    if restart_machine "$machine_id" "$machine_type" "$domain"; then
        log_message "Successfully restarted $machine_type $machine_id for $domain"
        return 0
    fi

    log_message "Failed to restart $machine_type $machine_id for $domain - manual intervention required"
    return 1
}

# Entry point: validate the configuration, monitor every configured domain,
# log a summary, and exit.
# Globals:   SCRIPT_DIR, CONFIG_FILE, LOG_FILE (read)
# Exits (never returns):
#   1 when the configuration file is missing or invalid, or when at least one
#     monitor_domain call reported failure,
#   0 otherwise.
main() {
    log_message "========================================"
    log_message "Starting Universal Multi-Domain Health Monitor"
    log_message "Script directory: $SCRIPT_DIR"
    log_message "Configuration file: $CONFIG_FILE"
    log_message "Log file: $LOG_FILE"
    log_message "========================================"

    # The config file is expected next to the script
    local config_path="$SCRIPT_DIR/$CONFIG_FILE"

    if [[ ! -f "$config_path" ]]; then
        log_message "ERROR: Configuration file not found at $config_path"
        log_message "Please create $CONFIG_FILE in the same directory as this script"
        exit 1
    fi

    # Validate configuration
    if ! validate_config_file "$config_path"; then
        log_message "Exiting due to configuration errors"
        exit 1
    fi

    local total_domains=0
    local successful_operations=0
    local failed_operations=0
    local line trimmed

    # Process each line in config file.
    # - The file is read on descriptor 3, not stdin, so a command started by
    #   monitor_domain that reads stdin cannot swallow the remaining entries.
    # - `|| [[ -n "$line" ]]` keeps a final entry without a trailing newline.
    while IFS= read -r -u 3 line || [[ -n "$line" ]]; do
        # Skip empty lines and comments (whitespace-trimmed view of the line)
        trimmed="${line#"${line%%[![:space:]]*}"}"
        trimmed="${trimmed%"${trimmed##*[![:space:]]}"}"
        if [[ -z "$trimmed" || "$trimmed" == \#* ]]; then
            continue
        fi

        if parse_config_line "$line"; then
            total_domains=$((total_domains + 1))
            log_message "----------------------------------------"

            if monitor_domain "$PARSED_DOMAIN" "$PARSED_TYPE" "$PARSED_ID"; then
                successful_operations=$((successful_operations + 1))
            else
                failed_operations=$((failed_operations + 1))
            fi
        fi
    done 3< "$config_path"

    log_message "========================================"
    log_message "Health check completed"
    log_message "Total domains monitored: $total_domains"
    log_message "Successful operations: $successful_operations"
    log_message "Failed operations: $failed_operations"
    log_message "========================================"

    if (( failed_operations > 0 )); then
        log_message "WARNING: $failed_operations operation(s) failed - manual intervention may be required"
        exit 1
    fi

    log_message "All monitored services are healthy or successfully restarted"
    exit 0
}

# Refuse to run unless the effective user is root
if (( EUID != 0 )); then
    echo "This script must be run as root (required for Proxmox qm/pct commands)"
    exit 1
fi

# Check dependencies: each required command must be resolvable
if ! command -v curl >/dev/null 2>&1; then
    echo "Error: curl is not installed. Please install curl first."
    exit 1
fi

if ! command -v qm >/dev/null 2>&1; then
    echo "Error: qm command not found. This script must run on a Proxmox host."
    exit 1
fi

if ! command -v pct >/dev/null 2>&1; then
    echo "Error: pct command not found. This script must run on a Proxmox host."
    exit 1
fi

# Run main function
main

exit 0
2. Configuration File: monitor-domains.txt

The user only needs to edit this file - no code touching required!

# Monitor Domains Configuration File
# Format: 'domain','type','id'
# 
# domain: The domain or IP to monitor (with or without http:// or https://; http:// is assumed when omitted)
# type: Either 'vm' or 'lxc'
# id: The numeric ID of the VM or LXC container
#
# Examples:
'www.domain-1.com','vm','101'
'www.domain-2.com','vm','102'
'www.domain-3.com','lxc','103'
'www.domain-4.com','lxc','104'
'192.168.1.100:8080','vm','105'
'api.example.com','lxc','106'
'https://secure.domain.com','vm','107'

# You can add comments like this
# 'disabled.domain.com','vm','108'

# Multiple entries are supported
'service1.local','vm','201'
'service2.local:9000','lxc','202'
Key Features


✅ User-Friendly Configuration

  • Zero code modification needed
  • Simple text file editing
  • Clear format with examples and comments
  • Automatic validation with helpful error messages

✅ Flexible Domain Support

  • Supports domains with or without protocol (domain.com or https://domain.com)
  • IP addresses with ports (192.168.1.100:8080)
  • Automatic http:// addition if missing

✅ Complete Status Verification

  • Executes qm stop XXX / pct stop XXX
  • Verifies the machine actually stopped
  • Waits 5 seconds (configurable via STOP_START_DELAY in the script)
  • Executes qm start XXX / pct start XXX
  • Verifies the machine actually started

✅ Comprehensive Logging

  • Log file created in same directory as script: vm-health-monitor.log
  • Timestamped entries
  • Status indicators (✓, ✗, ⚠)
  • Detailed restart sequences
  • Success/failure tracking

Installation & Usage

1. Setup Files

Place both files in the same directory (e.g., /root/):

/root/universal-health-monitor.sh
/root/monitor-domains.txt
2. Configure Your Domains

Edit monitor-domains.txt:

nano monitor-domains.txt

Add your domains using the exact format:

'your-domain.com','vm','101'
'192.168.1.50:8080','lxc','103'
'https://api.mysite.com','vm','105'
3. Make Executable & Test
chmod +x universal-health-monitor.sh
sudo ./universal-health-monitor.sh
4. Automate with Cron
crontab -e

Add the following line at the end of the file (use root's crontab, because the script refuses to run as a non-root user):

*/5 * * * * /root/universal-health-monitor.sh

Configuration Format Rules

✅ Valid Formats:
'domain.com','vm','101'
'192.168.1.100:8080','lxc','102'
'https://secure.com','vm','103'
'www.example.com','lxc','104'
❌ Invalid Formats:
domain.com,vm,101              # Missing quotes
'domain.com','VM','101'        # Wrong case (must be lowercase)
'domain.com','vm',101          # ID not quoted
'','vm','101'                  # Empty domain
'domain.com','server','101'    # Invalid type (must be 'vm' or 'lxc')
Sample Log Output
2025-08-03 18:45:01 - Starting Universal Multi-Domain Health Monitor
2025-08-03 18:45:01 - Configuration validation passed
2025-08-03 18:45:01 - ----------------------------------------
2025-08-03 18:45:01 - Monitoring www.domain-1.com -> vm 101
2025-08-03 18:45:02 - ✓ www.domain-1.com is responding (attempt 1/3)
2025-08-03 18:45:02 - ✓ www.domain-1.com is healthy - no action needed for vm 101
2025-08-03 18:45:02 - ----------------------------------------
2025-08-03 18:45:02 - Monitoring www.domain-2.com -> vm 102
2025-08-03 18:45:05 - ⚠ www.domain-2.com not responding (attempt 1/3) - retrying in 5 seconds...
2025-08-03 18:45:15 - ✗ www.domain-2.com failed after 3 attempts
2025-08-03 18:45:15 - === INITIATING RESTART SEQUENCE FOR www.domain-2.com (vm 102) ===
2025-08-03 18:45:15 - Attempting to stop vm 102 for domain www.domain-2.com
2025-08-03 18:45:18 - ✓ VM 102 stop command executed successfully
2025-08-03 18:45:19 - ✓ VM 102 confirmed stopped
2025-08-03 18:45:24 - Attempting to start vm 102 for domain www.domain-2.com
2025-08-03 18:45:26 - ✓ VM 102 start command executed successfully
2025-08-03 18:45:35 - ✓ VM 102 confirmed running
2025-08-03 18:45:35 - === RESTART SEQUENCE COMPLETED SUCCESSFULLY FOR www.domain-2.com ===

This solution is completely universal - users only need to edit the simple text configuration file, never touching any code!

Green