"""System health service — CPU, RAM, disk, uptime, and 24-hour history. Calls the gateway ``health`` or ``status`` RPC methods and normalises the response into structured health snapshots. A module-level ``HealthHistory`` instance accumulates snapshots per gateway for up to 24 hours. Parser is deliberately defensive: unknown keys are ignored and all numeric fields default to None when absent or non-numeric. """ from __future__ import annotations from datetime import datetime, timedelta from typing import Any from app.core.logging import get_logger from app.core.time import utcnow from app.services.openclaw.gateway_rpc import ( GatewayConfig, OpenClawGatewayError, openclaw_call, ) logger = get_logger(__name__) DEFAULT_HISTORY_WINDOW_HOURS = 24 # --------------------------------------------------------------------------- # HealthSnapshot # --------------------------------------------------------------------------- class HealthSnapshot: """Normalised point-in-time system health reading.""" __slots__ = ( "recorded_at", "cpu_pct", "memory_pct", "memory_used_bytes", "memory_total_bytes", "disk_pct", "disk_used_bytes", "disk_total_bytes", "uptime_seconds", "load_avg_1m", "load_avg_5m", "load_avg_15m", "hostname", "platform", ) def __init__( self, *, recorded_at: datetime, cpu_pct: float | None = None, memory_pct: float | None = None, memory_used_bytes: int | None = None, memory_total_bytes: int | None = None, disk_pct: float | None = None, disk_used_bytes: int | None = None, disk_total_bytes: int | None = None, uptime_seconds: int | None = None, load_avg_1m: float | None = None, load_avg_5m: float | None = None, load_avg_15m: float | None = None, hostname: str | None = None, platform: str | None = None, ) -> None: self.recorded_at = recorded_at self.cpu_pct = cpu_pct self.memory_pct = memory_pct self.memory_used_bytes = memory_used_bytes self.memory_total_bytes = memory_total_bytes self.disk_pct = disk_pct self.disk_used_bytes = disk_used_bytes self.disk_total_bytes = disk_total_bytes self.uptime_seconds = uptime_seconds self.load_avg_1m = load_avg_1m self.load_avg_5m = load_avg_5m self.load_avg_15m = load_avg_15m self.hostname = hostname self.platform = platform def to_dict(self) -> dict[str, Any]: return { "recorded_at": self.recorded_at.isoformat(), "cpu_pct": self.cpu_pct, "memory_pct": self.memory_pct, "memory_used_bytes": self.memory_used_bytes, "memory_total_bytes": self.memory_total_bytes, "disk_pct": self.disk_pct, "disk_used_bytes": self.disk_used_bytes, "disk_total_bytes": self.disk_total_bytes, "uptime_seconds": self.uptime_seconds, "load_avg_1m": self.load_avg_1m, "load_avg_5m": self.load_avg_5m, "load_avg_15m": self.load_avg_15m, "hostname": self.hostname, "platform": self.platform, } # --------------------------------------------------------------------------- # Parser — pure function # --------------------------------------------------------------------------- def _float(d: dict[str, Any], *keys: str) -> float | None: for k in keys: v = d.get(k) if v is not None: try: return float(v) except (TypeError, ValueError): pass return None def _int(d: dict[str, Any], *keys: str) -> int | None: f = _float(d, *keys) return int(f) if f is not None else None def _pct_from_used_total(used: int | None, total: int | None) -> float | None: if used is not None and total and total > 0: return round(used / total * 100, 1) return None def parse_health_response(raw: object) -> HealthSnapshot: """Parse a gateway ``health`` / ``status`` response into a HealthSnapshot. Never raises — returns an empty snapshot on any input. """ now = utcnow() if not isinstance(raw, dict): return HealthSnapshot(recorded_at=now) # CPU — try nested block first, then top-level alt keys cpu_block = raw.get("cpu") if isinstance(cpu_block, dict) and cpu_block: cpu_pct = _float(cpu_block, "usage", "percent", "pct") load_avgs = cpu_block.get("loadAvg") or cpu_block.get("load_avg") load_avg_1m = float(load_avgs[0]) if isinstance(load_avgs, list) and load_avgs else None load_avg_5m = float(load_avgs[1]) if isinstance(load_avgs, list) and len(load_avgs) > 1 else None load_avg_15m = float(load_avgs[2]) if isinstance(load_avgs, list) and len(load_avgs) > 2 else None else: cpu_pct = _float(raw, "cpuUsage", "cpu_usage", "cpu_pct", "cpu_percent") load_avg_1m = load_avg_5m = load_avg_15m = None # Memory — try nested block first, then top-level alt keys mem_block = raw.get("memory") or raw.get("mem") if isinstance(mem_block, dict) and mem_block: mem_used = _int(mem_block, "used", "rss", "heapUsed") mem_total = _int(mem_block, "total", "heapTotal") mem_pct = _float(mem_block, "percent", "pct", "usage") or _pct_from_used_total(mem_used, mem_total) else: mem_used = _int(raw, "memUsed", "mem_used", "memory_used") mem_total = _int(raw, "memTotal", "mem_total", "memory_total") mem_pct = _pct_from_used_total(mem_used, mem_total) # Disk — try nested block first, then top-level alt keys disk_block = raw.get("disk") or raw.get("storage") if isinstance(disk_block, dict) and disk_block: disk_used = _int(disk_block, "used") disk_total = _int(disk_block, "total") disk_pct = _float(disk_block, "percent", "pct", "usage") or _pct_from_used_total(disk_used, disk_total) else: disk_used = _int(raw, "diskUsed", "disk_used") disk_total = _int(raw, "diskTotal", "disk_total") disk_pct = _pct_from_used_total(disk_used, disk_total) # Uptime uptime = _int(raw, "uptime", "uptimeSeconds", "uptime_seconds") # Hostname / platform hostname = raw.get("hostname") or raw.get("host") platform = raw.get("platform") or raw.get("os") return HealthSnapshot( recorded_at=now, cpu_pct=cpu_pct, memory_pct=mem_pct, memory_used_bytes=mem_used, memory_total_bytes=mem_total, disk_pct=disk_pct, disk_used_bytes=disk_used, disk_total_bytes=disk_total, uptime_seconds=uptime, load_avg_1m=load_avg_1m, load_avg_5m=load_avg_5m, load_avg_15m=load_avg_15m, hostname=str(hostname) if hostname else None, platform=str(platform) if platform else None, ) # --------------------------------------------------------------------------- # Rolling history # --------------------------------------------------------------------------- class HealthHistory: """In-memory rolling window of HealthSnapshots per gateway ID.""" def __init__(self, window_hours: int = DEFAULT_HISTORY_WINDOW_HOURS) -> None: self.window_hours = window_hours self._data: dict[str, list[HealthSnapshot]] = {} def add(self, gateway_id: str, snapshot: HealthSnapshot) -> None: """Append a snapshot and prune entries outside the window.""" bucket = self._data.setdefault(gateway_id, []) bucket.append(snapshot) cutoff = utcnow() - timedelta(hours=self.window_hours) self._data[gateway_id] = sorted( [s for s in bucket if s.recorded_at >= cutoff], key=lambda s: s.recorded_at, ) def get(self, gateway_id: str) -> list[HealthSnapshot]: """Return snapshots oldest-first for the given gateway.""" return list(self._data.get(gateway_id, [])) def latest(self, gateway_id: str) -> HealthSnapshot | None: """Return the most recent snapshot for the given gateway.""" snaps = self._data.get(gateway_id) return snaps[-1] if snaps else None # --------------------------------------------------------------------------- # Gateway fetch # --------------------------------------------------------------------------- # Module-level singleton — shared across requests _history = HealthHistory() async def fetch_health( gateway_id: str, config: GatewayConfig, *, record: bool = True, ) -> HealthSnapshot: """Fetch current health from the gateway and optionally store in history. Returns an empty snapshot on any failure. """ raw: dict[str, Any] = {} for method in ("health", "status"): try: result = await openclaw_call(method, config=config) if isinstance(result, dict) and result: raw = result break except (OpenClawGatewayError, TimeoutError, OSError, RuntimeError) as exc: logger.debug("system_health.fetch_failed method=%s error=%s", method, exc) except Exception as exc: logger.warning("system_health.fetch_unexpected method=%s error=%s", method, exc) snapshot = parse_health_response(raw) if record: _history.add(gateway_id, snapshot) return snapshot def get_history(gateway_id: str) -> list[HealthSnapshot]: """Return the 24-hour history for a gateway (oldest-first).""" return _history.get(gateway_id)