You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

144 lines
4.3 KiB

#!/usr/bin/env python3
"""
System Resource Monitor - Logs system stats to help diagnose SSH connectivity issues.
This script monitors:
- CPU usage
- Memory usage
- Disk usage
- Network I/O counters (bytes sent/received)
- SSH service status
- System load
- Active connections
Run continuously to capture when the system becomes unreachable.
"""
import logging
import logging.handlers
import subprocess
import time
from datetime import datetime
from pathlib import Path

import psutil
# Log to a size-capped rotating file: this monitor is meant to run
# continuously, so an unbounded log would itself eat the disk it watches.
# Each file is capped at 10 MiB and 5 rotated backups are kept.
log_file = Path("/var/log/system_monitor.log")
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.handlers.RotatingFileHandler(
            log_file,
            maxBytes=10 * 1024 * 1024,
            backupCount=5,
        ),
        logging.StreamHandler(),  # Also print to console
    ],
)
def check_ssh_service(service: str = 'ssh') -> dict:
    """
    Check whether a systemd unit (the SSH daemon by default) is active.

    Runs ``systemctl is-active <service>`` with a 5 second timeout.

    Args:
        service: systemd unit name to query. Defaults to ``'ssh'``; pass
            ``'sshd'`` on distributions that name the unit that way.

    Returns:
        dict: Service status information with these keys:
            - ``'running'`` (bool): True only when systemctl exited with 0.
            - ``'status'`` (str): text printed by systemctl, or ``'unknown'``
              when nothing was printed or systemctl could not be run.
            - ``'error'`` (str): present only when systemctl could not be
              executed or timed out.
    """
    try:
        result = subprocess.run(
            ['systemctl', 'is-active', service],
            capture_output=True,
            text=True,
            timeout=5,
        )
    except (OSError, subprocess.SubprocessError) as e:
        # systemctl is missing, not executable, or hung past the timeout.
        # The real state is unknown; report it as not running and keep the
        # reason so the caller can log it.
        return {'running': False, 'status': 'unknown', 'error': str(e)}

    return {
        'running': result.returncode == 0,
        # Fall back to 'unknown' so the status is never an empty string.
        'status': result.stdout.strip() or 'unknown',
    }
def get_system_stats() -> dict:
    """
    Collect a snapshot of current system statistics.

    Blocks for about one second while CPU utilisation is sampled.

    Returns:
        dict: System statistics with these keys:
            - ``cpu_percent``: CPU utilisation over the sampling window.
            - ``cpu_count``: CPU count reported by psutil (1 if undetermined).
            - ``memory_percent`` / ``memory_available_gb``: virtual memory
              usage and available memory in GiB.
            - ``swap_percent``: swap usage.
            - ``disk_percent`` / ``disk_free_gb``: usage and free space in
              GiB of the ``/`` filesystem.
            - ``network_bytes_sent`` / ``network_bytes_recv``: network I/O
              counters as reported by psutil.
            - ``load_1min`` / ``load_5min`` / ``load_15min``: load averages.
            - ``connections``: number of network connections, or ``-1`` when
              the process lacks permission to enumerate them.
    """
    # CPU usage (interval=1 makes this a blocking one-second sample)
    cpu_percent = psutil.cpu_percent(interval=1)
    # cpu_count() returns None when it cannot be determined; fall back to 1
    # so callers can safely do arithmetic on the value.
    cpu_count = psutil.cpu_count() or 1

    # Memory usage
    memory = psutil.virtual_memory()
    swap = psutil.swap_memory()

    # Disk usage of the root filesystem
    disk = psutil.disk_usage('/')

    # Network stats
    net_io = psutil.net_io_counters()

    # System load (1, 5, 15 minute averages)
    load_1min, load_5min, load_15min = psutil.getloadavg()

    # Number of connections. Enumerating them needs elevated privileges on
    # some platforms; do not lose the whole snapshot over this one metric.
    try:
        connections = len(psutil.net_connections())
    except (psutil.AccessDenied, PermissionError):
        connections = -1

    return {
        'cpu_percent': cpu_percent,
        'cpu_count': cpu_count,
        'memory_percent': memory.percent,
        'memory_available_gb': memory.available / (1024**3),
        'swap_percent': swap.percent,
        'disk_percent': disk.percent,
        'disk_free_gb': disk.free / (1024**3),
        'network_bytes_sent': net_io.bytes_sent,
        'network_bytes_recv': net_io.bytes_recv,
        'load_1min': load_1min,
        'load_5min': load_5min,
        'load_15min': load_15min,
        'connections': connections,
    }
def _format_status_line(stats: dict, ssh_status: dict) -> str:
    """Render one log line summarising the collected stats and SSH state."""
    ssh_text = ssh_status.get('status', 'unknown')
    if ssh_status.get('error'):
        # Keep the reason the SSH check failed; otherwise it is never logged.
        ssh_text = f"{ssh_text} ({ssh_status['error']})"
    return (
        f"CPU: {stats['cpu_percent']:.1f}% | "
        f"MEM: {stats['memory_percent']:.1f}% ({stats['memory_available_gb']:.2f}GB free) | "
        f"DISK: {stats['disk_percent']:.1f}% ({stats['disk_free_gb']:.2f}GB free) | "
        f"LOAD: {stats['load_1min']:.2f} {stats['load_5min']:.2f} {stats['load_15min']:.2f} | "
        f"CONN: {stats['connections']} | "
        f"SSH: {ssh_text}"
    )


def _collect_alerts(stats: dict, ssh_status: dict) -> list:
    """Return the label of every threshold that is currently breached."""
    alerts = []
    if stats['cpu_percent'] > 90:
        alerts.append("HIGH CPU!")
    if stats['memory_percent'] > 90:
        alerts.append("HIGH MEMORY!")
    if stats['disk_percent'] > 90:
        alerts.append("HIGH DISK USAGE!")
    # Load is considered high above twice the CPU count; treat a missing
    # CPU count as 1 so the comparison cannot raise.
    if stats['load_1min'] > (stats['cpu_count'] or 1) * 2:
        alerts.append("HIGH LOAD!")
    if not ssh_status.get('running'):
        alerts.append("SSH SERVICE DOWN!")
    return alerts


def monitor_loop(interval_seconds: int = 60):
    """
    Main monitoring loop that logs system stats at regular intervals.

    Each iteration collects the stats and the SSH service state and writes
    one log line. Every breached threshold is reported on that line (not
    just the first one): the line is logged at ERROR when the SSH service is
    not running, at WARNING when only resource thresholds are breached, and
    at INFO otherwise. Runs until the process is interrupted.

    Args:
        interval_seconds: How long to sleep between iterations
            (default: 60 seconds)
    """
    logging.info("Starting system monitoring...")
    while True:
        try:
            stats = get_system_stats()
            ssh_status = check_ssh_service()
            log_message = _format_status_line(stats, ssh_status)
            alerts = _collect_alerts(stats, ssh_status)
            if alerts:
                # SSH being down is the condition this tool exists to catch,
                # so it always escalates the line to ERROR.
                level = logging.WARNING if ssh_status.get('running') else logging.ERROR
                logging.log(level, "%s %s", " ".join(alerts), log_message)
            else:
                logging.info(log_message)
        except Exception as e:
            # Keep the monitor alive on any failure, but record the traceback.
            logging.exception("Error in monitoring loop: %s", e)
        time.sleep(interval_seconds)
if __name__ == "__main__":
    try:
        monitor_loop(interval_seconds=60)  # Log every 60 seconds
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop the monitor; record it in the log
        # so a deliberate stop can be told apart from a crash or a hang.
        logging.info("System monitoring stopped by user.")