You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
144 lines
4.3 KiB
144 lines
4.3 KiB
#!/usr/bin/env python3
"""
System Resource Monitor - Logs system stats to help diagnose SSH connectivity issues.

This script monitors:
- CPU usage
- Memory and swap usage
- Disk usage
- Network I/O counters
- SSH service status
- System load
- Active connections

Run continuously to capture when the system becomes unreachable.
"""
|
|
|
import logging
import logging.handlers
import time
from datetime import datetime
from pathlib import Path

import psutil
|
|
|
# Set up logging to a size-rotated file plus the console.
log_file = Path("/var/log/system_monitor.log")

# Rotation limits: keep the active log around 10 MiB and retain 5 old files so
# a long-running monitor cannot fill the disk it is supposed to be watching.
_LOG_MAX_BYTES = 10 * 1024 * 1024
_LOG_BACKUP_COUNT = 5

_log_handlers = [logging.StreamHandler()]  # Always print to console
_log_file_error = None
try:
    _log_handlers.insert(
        0,
        logging.handlers.RotatingFileHandler(
            log_file,
            maxBytes=_LOG_MAX_BYTES,
            backupCount=_LOG_BACKUP_COUNT,
        ),
    )
except OSError as exc:
    # The log file could not be opened (e.g. insufficient permissions).
    # Keep monitoring on the console instead of crashing at import time.
    _log_file_error = exc

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=_log_handlers,
)

if _log_file_error is not None:
    logging.warning(
        "Cannot open log file %s (%s); logging to console only",
        log_file,
        _log_file_error,
    )
|
|
|
def check_ssh_service(service_names: tuple = ("ssh", "sshd")) -> dict:
    """
    Check if the SSH service is running according to systemd.

    Each candidate unit name is queried with ``systemctl is-active`` until
    one reports success. Two names are tried by default because the unit is
    called ``ssh`` on some distributions and ``sshd`` on others.

    Args:
        service_names: Unit names to try, in order. A single string is
            accepted and treated as one name.

    Returns:
        dict: Service status information with the keys
            ``running`` (bool): True if any candidate unit is active.
            ``status`` (str): State text printed by systemctl, or
                ``"unknown"`` when no state could be obtained.
            ``error`` (str): Present only when systemctl could not be run.
    """
    # Imported locally so the module-level import block stays unchanged.
    import subprocess

    if isinstance(service_names, str):
        # Guard against iterating over the characters of a bare string.
        service_names = (service_names,)

    statuses = []
    for unit in service_names:
        try:
            result = subprocess.run(
                ['systemctl', 'is-active', unit],
                capture_output=True,
                text=True,
                timeout=5,
            )
        except (OSError, subprocess.SubprocessError) as e:
            # systemctl is missing, not executable, or timed out. Another
            # unit name would fail the same way, so report it right away.
            return {'running': False, 'status': 'unknown', 'error': str(e)}

        state = result.stdout.strip() or 'unknown'
        if result.returncode == 0:
            return {'running': True, 'status': state}
        statuses.append(state)

    # No candidate is active. Prefer a specific state (e.g. "failed") over the
    # generic "inactive"/"unknown" reported for a unit that does not exist.
    specific = [s for s in statuses if s not in ('inactive', 'unknown')]
    if specific:
        status = specific[0]
    elif statuses:
        status = statuses[-1]
    else:
        status = 'unknown'
    return {'running': False, 'status': status}
|
|
|
def get_system_stats() -> dict:
    """
    Collect current system statistics.

    The call blocks for about one second because CPU usage is sampled with
    ``interval=1``.

    Returns:
        dict: System statistics with the keys ``cpu_percent``, ``cpu_count``,
            ``memory_percent``, ``memory_available_gb``, ``swap_percent``,
            ``disk_percent``, ``disk_free_gb`` (root filesystem),
            ``network_bytes_sent``, ``network_bytes_recv``, ``load_1min``,
            ``load_5min``, ``load_15min`` and ``connections``.
    """
    bytes_per_gib = 1024 ** 3

    # CPU usage. cpu_count() can return None when the count is undetermined;
    # fall back to 1 so callers can safely do arithmetic with it.
    cpu_percent = psutil.cpu_percent(interval=1)
    cpu_count = psutil.cpu_count() or 1

    # Memory usage
    memory = psutil.virtual_memory()
    swap = psutil.swap_memory()

    # Disk usage of the root filesystem
    disk = psutil.disk_usage('/')

    # Network stats. net_io_counters() returns None on a machine with no
    # network interfaces; report zero traffic instead of raising.
    net_io = psutil.net_io_counters()
    bytes_sent = net_io.bytes_sent if net_io is not None else 0
    bytes_recv = net_io.bytes_recv if net_io is not None else 0

    # System load (1, 5, 15 minute averages)
    load_1min, load_5min, load_15min = psutil.getloadavg()

    # Number of connections
    # NOTE(review): psutil documents that this call needs elevated privileges
    # on some platforms - confirm the monitor runs with sufficient rights.
    connections = len(psutil.net_connections())

    return {
        'cpu_percent': cpu_percent,
        'cpu_count': cpu_count,
        'memory_percent': memory.percent,
        'memory_available_gb': memory.available / bytes_per_gib,
        'swap_percent': swap.percent,
        'disk_percent': disk.percent,
        'disk_free_gb': disk.free / bytes_per_gib,
        'network_bytes_sent': bytes_sent,
        'network_bytes_recv': bytes_recv,
        'load_1min': load_1min,
        'load_5min': load_5min,
        'load_15min': load_15min,
        'connections': connections,
    }
|
|
|
# Percentage above which CPU, memory and disk usage are reported as high.
_HIGH_USAGE_PERCENT = 90
# 1-minute load above (CPU count * this factor) is reported as high.
_HIGH_LOAD_PER_CPU = 2


def _format_stats(stats: dict, ssh_status: dict) -> str:
    """Render one sample as the single-line summary written to the log."""
    ssh_text = ssh_status.get('status', 'unknown')
    if ssh_status.get('error'):
        # Keep the reason the SSH check failed instead of dropping it.
        ssh_text = f"{ssh_text} ({ssh_status['error']})"

    return (
        f"CPU: {stats['cpu_percent']:.1f}% | "
        f"MEM: {stats['memory_percent']:.1f}% ({stats['memory_available_gb']:.2f}GB free) | "
        f"DISK: {stats['disk_percent']:.1f}% ({stats['disk_free_gb']:.2f}GB free) | "
        f"LOAD: {stats['load_1min']:.2f} {stats['load_5min']:.2f} {stats['load_15min']:.2f} | "
        f"CONN: {stats['connections']} | "
        f"SSH: {ssh_text}"
    )


def _collect_alerts(stats: dict, ssh_status: dict) -> list:
    """Return a (log level, label) pair for every threshold currently breached."""
    alerts = []
    if not ssh_status.get('running'):
        alerts.append((logging.ERROR, "SSH SERVICE DOWN!"))
    if stats['cpu_percent'] > _HIGH_USAGE_PERCENT:
        alerts.append((logging.WARNING, "HIGH CPU!"))
    if stats['memory_percent'] > _HIGH_USAGE_PERCENT:
        alerts.append((logging.WARNING, "HIGH MEMORY!"))
    if stats['disk_percent'] > _HIGH_USAGE_PERCENT:
        alerts.append((logging.WARNING, "HIGH DISK USAGE!"))
    # A missing CPU count is treated as 1 so the comparison cannot fail.
    if stats['load_1min'] > (stats['cpu_count'] or 1) * _HIGH_LOAD_PER_CPU:
        alerts.append((logging.WARNING, "HIGH LOAD!"))
    return alerts


def monitor_loop(interval_seconds: int = 60, max_iterations: "int | None" = None) -> None:
    """
    Main monitoring loop that logs system stats at regular intervals.

    Every breached threshold is named in the log line, and the line is
    written at the highest severity among them, so an SSH outage is logged as
    an error even when CPU, memory, disk or load are also high. A failure
    while sampling is logged with its traceback and the loop carries on.

    Args:
        interval_seconds: How often to log stats (default: 60 seconds). Time
            spent collecting a sample is subtracted from the sleep so the
            period does not drift.
        max_iterations: Stop after this many samples. ``None`` (the default)
            runs forever.
    """
    logging.info("Starting system monitoring...")

    completed = 0
    while max_iterations is None or completed < max_iterations:
        started = time.monotonic()
        try:
            stats = get_system_stats()
            ssh_status = check_ssh_service()

            log_message = _format_stats(stats, ssh_status)
            alerts = _collect_alerts(stats, ssh_status)

            if alerts:
                level = max(alert_level for alert_level, _ in alerts)
                labels = " ".join(label for _, label in alerts)
                logging.log(level, "%s %s", labels, log_message)
            else:
                logging.info(log_message)
        except Exception as e:
            # Top-level boundary: one bad sample must not stop the monitor.
            logging.exception("Error in monitoring loop: %s", e)

        completed += 1
        if max_iterations is not None and completed >= max_iterations:
            break  # No point sleeping after the final sample.

        elapsed = time.monotonic() - started
        time.sleep(max(0.0, interval_seconds - elapsed))
|
|
|
if __name__ == "__main__":
    try:
        monitor_loop(interval_seconds=60)  # Log every 60 seconds
    except KeyboardInterrupt:
        # Record a deliberate stop so the log distinguishes it from the
        # monitor (or the whole machine) dying mid-run.
        logging.info("System monitoring stopped by user.")