#!/bin/bash
#
# Universal Multi-Domain Multi-Machine Health Monitor Script
#
# Reads domain configurations from monitor-domains.txt (expected in the same
# directory as this script), probes every configured domain over HTTP(S) and
# restarts the Proxmox VM or LXC container mapped to a domain that does not
# respond.
#
# Configuration line format (blank lines and lines starting with '#' are
# ignored):
#   'domain','type','id'
# where type is 'vm' or 'lxc' and id is numeric.
#
# Must be run as root; requires curl, qm and pct.
#
# Exit status: 0 when every domain is healthy or its machine was restarted,
#              1 on configuration errors, missing prerequisites or when at
#              least one restart failed.

# Configuration
CONFIG_FILE="monitor-domains.txt"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LOG_FILE="$SCRIPT_DIR/vm-health-monitor.log"
TIMEOUT=10             # curl connect timeout and total time limit (seconds)
MAX_RETRIES=3          # probes per domain before it is declared failed
RETRY_DELAY=5          # seconds to wait between failed probes
STOP_START_DELAY=5     # seconds to wait between stop and start
STOP_WAIT_SECONDS=30   # how long to wait for a machine to report stopped
START_WAIT_SECONDS=60  # how long to wait for a machine to report running
readonly CONFIG_FILE SCRIPT_DIR LOG_FILE TIMEOUT MAX_RETRIES RETRY_DELAY
readonly STOP_START_DELAY STOP_WAIT_SECONDS START_WAIT_SECONDS

#######################################
# Log a message with a timestamp to stdout and append it to LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
#######################################
log_message() {
  printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG_FILE"
}

#######################################
# Strip leading and trailing whitespace from a string.
# Arguments: $1 - text to trim
# Outputs:   the trimmed text on stdout (no trailing newline)
#######################################
trim_whitespace() {
  local text="$1"
  text="${text#"${text%%[![:space:]]*}"}"
  text="${text%"${text##*[![:space:]]}"}"
  printf '%s' "$text"
}

#######################################
# Tell whether a config line carries no entry (blank or '#' comment).
# Arguments: $1 - raw config line
# Returns:   0 if the line should be ignored, 1 otherwise
#######################################
is_ignorable_line() {
  local trimmed
  trimmed="$(trim_whitespace "$1")"
  [[ -z "$trimmed" || "$trimmed" == \#* ]]
}

#######################################
# Check whether a URL is responding.
# The probe follows redirects (-L) and skips TLS certificate verification
# (-k). curl is run without --fail, so an HTTP error status still counts as
# "responding"; only transport-level failures (DNS, connect, timeout, ...)
# make the check fail.
# Arguments: $1 - URL or bare host; "http://" is prepended when no scheme
#                 is present
#            $2 - timeout in seconds (used for connect and total time)
# Returns:   0 if curl succeeded, 1 otherwise
#######################################
check_url() {
  local url="$1"
  local timeout="$2"

  # Add http:// if not present
  if [[ ! "$url" =~ ^https?:// ]]; then
    url="http://$url"
  fi

  if curl -s -k -L --connect-timeout "$timeout" --max-time "$timeout" \
    "$url" > /dev/null 2>&1; then
    return 0
  fi
  return 1
}

#######################################
# Check whether a VM/LXC reports "status: running".
# Arguments: $1 - machine ID
#            $2 - machine type ('vm' or 'lxc')
# Returns:   0 if running; 1 if not running, if the status command failed,
#            or if the machine type is unknown
#######################################
check_machine_status() {
  local machine_id="$1"
  local machine_type="$2"
  local cli

  case "$machine_type" in
    vm) cli="qm" ;;
    lxc) cli="pct" ;;
    *)
      log_message "ERROR: Unknown machine type '$machine_type'"
      return 1
      ;;
  esac

  "$cli" status "$machine_id" | grep -q "status: running"
}

#######################################
# Poll the machine status once per second until it matches the desired state.
# Arguments: $1 - machine ID
#            $2 - machine type ('vm' or 'lxc')
#            $3 - desired state: 'running' or 'stopped'
#            $4 - maximum number of polls (roughly seconds)
# Returns:   0 once the desired state is observed, 1 on timeout
#######################################
wait_for_machine_state() {
  local machine_id="$1"
  local machine_type="$2"
  local desired="$3"
  local max_seconds="$4"
  local waited=0

  while ((waited < max_seconds)); do
    if check_machine_status "$machine_id" "$machine_type"; then
      if [[ "$desired" == "running" ]]; then
        return 0
      fi
    elif [[ "$desired" == "stopped" ]]; then
      return 0
    fi
    sleep 1
    waited=$((waited + 1))
  done
  return 1
}

#######################################
# Stop a machine and wait for it to report stopped.
# Globals:   STOP_WAIT_SECONDS (read)
# Arguments: $1 - machine ID
#            $2 - machine type ('vm' or 'lxc')
#            $3 - domain (used for logging only)
# Returns:   0 if the stop command succeeded (even when verification timed
#            out) or the machine was already not running; 1 otherwise
#######################################
stop_machine() {
  local machine_id="$1"
  local machine_type="$2"
  local domain="$3"
  local cli label

  log_message "Attempting to stop $machine_type $machine_id for domain $domain"

  case "$machine_type" in
    vm)
      cli="qm"
      label="VM"
      ;;
    lxc)
      cli="pct"
      label="LXC"
      ;;
    *)
      # Without this arm an unknown type would fall through with status 0
      # and the restart would be reported as successful.
      log_message "ERROR: Unknown machine type '$machine_type'"
      return 1
      ;;
  esac

  if ! "$cli" stop "$machine_id"; then
    # A machine that is already down has nothing to stop; let the caller go
    # on to the start phase instead of giving up on the restart.
    # NOTE(review): this targets stop commands that error on an already
    # stopped machine (believed to be the case for pct) - confirm against
    # the installed Proxmox version.
    if ! check_machine_status "$machine_id" "$machine_type"; then
      log_message "⚠ $label $machine_id stop command failed but machine is already not running"
      return 0
    fi
    log_message "✗ Failed to execute stop command for $label $machine_id"
    return 1
  fi

  log_message "✓ $label $machine_id stop command executed successfully"

  # Wait and verify it stopped
  if wait_for_machine_state "$machine_id" "$machine_type" "stopped" \
    "$STOP_WAIT_SECONDS"; then
    log_message "✓ $label $machine_id confirmed stopped"
  else
    log_message "⚠ $label $machine_id stop command executed but status verification timed out"
  fi
  return 0 # Command succeeded even if verification timed out
}

#######################################
# Start a machine and wait for it to report running.
# Globals:   START_WAIT_SECONDS (read)
# Arguments: $1 - machine ID
#            $2 - machine type ('vm' or 'lxc')
#            $3 - domain (used for logging only)
# Returns:   0 if the start command succeeded (even when verification timed
#            out); 1 otherwise
#######################################
start_machine() {
  local machine_id="$1"
  local machine_type="$2"
  local domain="$3"
  local cli label

  log_message "Attempting to start $machine_type $machine_id for domain $domain"

  case "$machine_type" in
    vm)
      cli="qm"
      label="VM"
      ;;
    lxc)
      cli="pct"
      label="LXC"
      ;;
    *)
      log_message "ERROR: Unknown machine type '$machine_type'"
      return 1
      ;;
  esac

  if ! "$cli" start "$machine_id"; then
    log_message "✗ Failed to execute start command for $label $machine_id"
    return 1
  fi

  log_message "✓ $label $machine_id start command executed successfully"

  # Wait and verify it started
  if wait_for_machine_state "$machine_id" "$machine_type" "running" \
    "$START_WAIT_SECONDS"; then
    log_message "✓ $label $machine_id confirmed running"
  else
    log_message "⚠ $label $machine_id start command executed but status verification timed out"
  fi
  return 0 # Command succeeded even if verification timed out
}

#######################################
# Restart a machine: stop, pause, start.
# Globals:   STOP_START_DELAY (read)
# Arguments: $1 - machine ID
#            $2 - machine type ('vm' or 'lxc')
#            $3 - domain (used for logging only)
# Returns:   0 if both phases succeeded, 1 if either failed
#######################################
restart_machine() {
  local machine_id="$1"
  local machine_type="$2"
  local domain="$3"

  log_message "=== INITIATING RESTART SEQUENCE FOR $domain ($machine_type $machine_id) ==="

  if ! stop_machine "$machine_id" "$machine_type" "$domain"; then
    log_message "=== RESTART SEQUENCE FAILED AT STOP PHASE FOR $domain ==="
    return 1
  fi

  log_message "Waiting $STOP_START_DELAY seconds before starting..."
  sleep "$STOP_START_DELAY"

  if ! start_machine "$machine_id" "$machine_type" "$domain"; then
    log_message "=== RESTART SEQUENCE FAILED AT START PHASE FOR $domain ==="
    return 1
  fi

  log_message "=== RESTART SEQUENCE COMPLETED SUCCESSFULLY FOR $domain ==="
  return 0
}

#######################################
# Parse one config line of the form 'domain','type','id'.
# Globals:   PARSED_DOMAIN, PARSED_TYPE, PARSED_ID (written on success)
# Arguments: $1 - raw config line
# Returns:   0 on success; 1 for blank/comment lines (silently) and for
#            malformed lines (after logging an error)
#######################################
parse_config_line() {
  local line
  # The pattern lives in a variable so the single quotes need no escaping
  # inside [[ =~ ]].
  local entry_re="^'([^']+)','([^']+)','([^']+)'\$"

  line="$(trim_whitespace "$1")"

  # Skip empty lines and comments
  if [[ -z "$line" || "$line" == \#* ]]; then
    return 1
  fi

  if [[ "$line" =~ $entry_re ]]; then
    PARSED_DOMAIN="${BASH_REMATCH[1]}"
    PARSED_TYPE="${BASH_REMATCH[2]}"
    PARSED_ID="${BASH_REMATCH[3]}"
    return 0
  fi

  log_message "ERROR: Invalid line format: $line"
  return 1
}

#######################################
# Validate every entry of the config file.
# Arguments: $1 - path to the config file
# Returns:   0 if the file exists and all entries are valid, 1 otherwise
#######################################
validate_config_file() {
  local config_file="$1"
  local errors=0
  local line_number=0
  local line

  if [[ ! -f "$config_file" ]]; then
    log_message "ERROR: Configuration file '$config_file' not found"
    return 1
  fi

  log_message "Validating configuration file: $config_file"

  # "|| [[ -n $line ]]" keeps a final line that lacks a trailing newline.
  while IFS= read -r -u 3 line || [[ -n "$line" ]]; do
    line_number=$((line_number + 1))

    if is_ignorable_line "$line"; then
      continue
    fi

    if ! parse_config_line "$line"; then
      log_message "ERROR: Invalid format at line $line_number: $line"
      errors=$((errors + 1))
      continue
    fi

    # Validate domain
    if [[ -z "$PARSED_DOMAIN" ]]; then
      log_message "ERROR: Empty domain at line $line_number"
      errors=$((errors + 1))
    fi

    # Validate type
    if [[ "$PARSED_TYPE" != "vm" && "$PARSED_TYPE" != "lxc" ]]; then
      log_message "ERROR: Invalid machine type '$PARSED_TYPE' at line $line_number (must be 'vm' or 'lxc')"
      errors=$((errors + 1))
    fi

    # Validate ID
    if [[ ! "$PARSED_ID" =~ ^[0-9]+$ ]]; then
      log_message "ERROR: Invalid machine ID '$PARSED_ID' at line $line_number (must be numeric)"
      errors=$((errors + 1))
    fi
  done 3< "$config_file"

  if ((errors > 0)); then
    log_message "Configuration validation failed with $errors errors"
    return 1
  fi

  log_message "Configuration validation passed"
  return 0
}

#######################################
# Probe one domain and restart its machine when every probe fails.
# Globals:   TIMEOUT, MAX_RETRIES, RETRY_DELAY (read)
# Arguments: $1 - domain
#            $2 - machine type ('vm' or 'lxc')
#            $3 - machine ID
# Returns:   0 if the domain responded or the restart succeeded, 1 if the
#            restart failed
#######################################
monitor_domain() {
  local domain="$1"
  local machine_type="$2"
  local machine_id="$3"
  local retry_count=0
  local domain_failed=true

  log_message "Monitoring $domain -> $machine_type $machine_id"

  # Retry loop for this domain
  while ((retry_count < MAX_RETRIES)); do
    if check_url "$domain" "$TIMEOUT"; then
      log_message "✓ $domain is responding (attempt $((retry_count + 1))/$MAX_RETRIES)"
      domain_failed=false
      break
    fi

    retry_count=$((retry_count + 1))
    # No point sleeping after the final attempt.
    if ((retry_count < MAX_RETRIES)); then
      log_message "⚠ $domain not responding (attempt $retry_count/$MAX_RETRIES) - retrying in $RETRY_DELAY seconds..."
      sleep "$RETRY_DELAY"
    fi
  done

  if [[ "$domain_failed" != true ]]; then
    log_message "✓ $domain is healthy - no action needed for $machine_type $machine_id"
    return 0
  fi

  log_message "✗ $domain failed after $MAX_RETRIES attempts"
  log_message "Initiating restart for $machine_type $machine_id due to $domain failure"

  if restart_machine "$machine_id" "$machine_type" "$domain"; then
    log_message "Successfully restarted $machine_type $machine_id for $domain"
    return 0
  fi

  log_message "Failed to restart $machine_type $machine_id for $domain - manual intervention required"
  return 1
}

#######################################
# Verify the script runs as root and that curl, qm and pct are available.
# Exits with status 1 (after printing a message) when a check fails.
#######################################
check_prerequisites() {
  # Check if running as root
  if ((EUID != 0)); then
    echo "This script must be run as root (required for Proxmox qm/pct commands)"
    exit 1
  fi

  # Check dependencies
  if ! command -v curl > /dev/null 2>&1; then
    echo "Error: curl is not installed. Please install curl first."
    exit 1
  fi

  if ! command -v qm > /dev/null 2>&1; then
    echo "Error: qm command not found. This script must run on a Proxmox host."
    exit 1
  fi

  if ! command -v pct > /dev/null 2>&1; then
    echo "Error: pct command not found. This script must run on a Proxmox host."
    exit 1
  fi
}

#######################################
# Entry point: validate the configuration, monitor every configured domain
# and print a summary.
# Globals:   SCRIPT_DIR, CONFIG_FILE, LOG_FILE (read)
# Exits:     0 if no operation failed, 1 otherwise
#######################################
main() {
  local config_path="$SCRIPT_DIR/$CONFIG_FILE"
  local total_domains=0
  local successful_operations=0
  local failed_operations=0
  local line

  log_message "========================================"
  log_message "Starting Universal Multi-Domain Health Monitor"
  log_message "Script directory: $SCRIPT_DIR"
  log_message "Configuration file: $CONFIG_FILE"
  log_message "Log file: $LOG_FILE"
  log_message "========================================"

  # Check if config file exists in script directory
  if [[ ! -f "$config_path" ]]; then
    log_message "ERROR: Configuration file not found at $config_path"
    log_message "Please create $CONFIG_FILE in the same directory as this script"
    exit 1
  fi

  # Validate configuration
  if ! validate_config_file "$config_path"; then
    log_message "Exiting due to configuration errors"
    exit 1
  fi

  # The config is read on fd 3 rather than stdin so that commands started
  # inside the loop (curl, qm, pct) cannot consume the remaining entries.
  while IFS= read -r -u 3 line || [[ -n "$line" ]]; do
    # parse_config_line silently rejects blank lines and comments.
    if ! parse_config_line "$line"; then
      continue
    fi

    total_domains=$((total_domains + 1))
    log_message "----------------------------------------"

    if monitor_domain "$PARSED_DOMAIN" "$PARSED_TYPE" "$PARSED_ID"; then
      successful_operations=$((successful_operations + 1))
    else
      failed_operations=$((failed_operations + 1))
    fi
  done 3< "$config_path"

  log_message "========================================"
  log_message "Health check completed"
  log_message "Total domains monitored: $total_domains"
  log_message "Successful operations: $successful_operations"
  log_message "Failed operations: $failed_operations"
  log_message "========================================"

  if ((failed_operations > 0)); then
    log_message "WARNING: $failed_operations operation(s) failed - manual intervention may be required"
    exit 1
  fi

  log_message "All monitored services are healthy or successfully restarted"
  exit 0
}

check_prerequisites
main "$@"