# Pipeline/backend/app/services/openclaw/system_health.py

"""System health service — CPU, RAM, disk, uptime, and 24-hour history.

Calls the gateway ``health`` or ``status`` RPC methods and normalises
the response into structured health snapshots.  A module-level
``HealthHistory`` instance accumulates snapshots per gateway for up to
24 hours.

Parser is deliberately defensive: unknown keys are ignored and all
numeric fields default to None when absent or non-numeric.
"""

from __future__ import annotations

import math
from datetime import datetime, timedelta
from typing import Any

from app.core.logging import get_logger
from app.core.time import utcnow
from app.services.openclaw.gateway_rpc import (
    GatewayConfig,
    OpenClawGatewayError,
    openclaw_call,
)

# Logger obtained for this module's name; used by fetch_health() to report
# failed gateway calls.
logger = get_logger(__name__)

# Default retention window, in hours, used by HealthHistory when the caller
# does not pass an explicit ``window_hours``.
DEFAULT_HISTORY_WINDOW_HOURS = 24


# ---------------------------------------------------------------------------
# HealthSnapshot
# ---------------------------------------------------------------------------

class HealthSnapshot:
    """A single normalised system-health reading taken at one instant.

    Every metric is optional and defaults to ``None``; only ``recorded_at``
    is required.  All constructor arguments are keyword-only.
    """

    # The order here is also the key order produced by ``to_dict``.
    __slots__ = (
        "recorded_at",
        "cpu_pct",
        "memory_pct",
        "memory_used_bytes",
        "memory_total_bytes",
        "disk_pct",
        "disk_used_bytes",
        "disk_total_bytes",
        "uptime_seconds",
        "load_avg_1m",
        "load_avg_5m",
        "load_avg_15m",
        "hostname",
        "platform",
    )

    def __init__(
        self,
        *,
        recorded_at: datetime,
        cpu_pct: float | None = None,
        memory_pct: float | None = None,
        memory_used_bytes: int | None = None,
        memory_total_bytes: int | None = None,
        disk_pct: float | None = None,
        disk_used_bytes: int | None = None,
        disk_total_bytes: int | None = None,
        uptime_seconds: int | None = None,
        load_avg_1m: float | None = None,
        load_avg_5m: float | None = None,
        load_avg_15m: float | None = None,
        hostname: str | None = None,
        platform: str | None = None,
    ) -> None:
        # Identity / timing
        self.recorded_at = recorded_at
        self.hostname = hostname
        self.platform = platform
        self.uptime_seconds = uptime_seconds

        # CPU
        self.cpu_pct = cpu_pct
        self.load_avg_1m = load_avg_1m
        self.load_avg_5m = load_avg_5m
        self.load_avg_15m = load_avg_15m

        # Memory
        self.memory_pct = memory_pct
        self.memory_used_bytes = memory_used_bytes
        self.memory_total_bytes = memory_total_bytes

        # Disk
        self.disk_pct = disk_pct
        self.disk_used_bytes = disk_used_bytes
        self.disk_total_bytes = disk_total_bytes

    def to_dict(self) -> dict[str, Any]:
        """Return the snapshot as a plain dict.

        ``recorded_at`` is rendered with ``isoformat()``; every other field
        is copied through unchanged.
        """
        first, *remaining = HealthSnapshot.__slots__
        payload: dict[str, Any] = {first: self.recorded_at.isoformat()}
        for field_name in remaining:
            payload[field_name] = getattr(self, field_name)
        return payload


# ---------------------------------------------------------------------------
# Parser — pure function
# ---------------------------------------------------------------------------

def _float(d: dict[str, Any], *keys: str) -> float | None:
    """Return the first usable float found in ``d`` under any of ``keys``.

    Keys are tried in the order given.  A key is skipped when it is missing,
    maps to ``None``, cannot be converted by ``float()``, or converts to a
    non-finite value (NaN or +/-infinity).

    Returns:
        The first finite float found, or ``None`` if no key yields one.
    """
    for k in keys:
        v = d.get(k)
        if v is None:
            continue
        try:
            f = float(v)
        except (TypeError, ValueError, OverflowError):
            # OverflowError: an int too large to be represented as a float.
            continue
        # NaN/inf are not meaningful readings: int() rejects them and they
        # are not valid in strict JSON, so treat them as "non-numeric".
        if math.isfinite(f):
            return f
    return None


def _int(d: dict[str, Any], *keys: str) -> int | None:
    """Return the first numeric value under ``keys`` truncated to an int.

    Lookup and float conversion are delegated to ``_float``; the result is
    then truncated toward zero by ``int()``.

    Returns:
        The integer value, or ``None`` when no key yields a number or the
        number cannot be represented as an int.
    """
    f = _float(d, *keys)
    if f is None:
        return None
    try:
        return int(f)
    except (ValueError, OverflowError):
        # int(nan) raises ValueError and int(+/-inf) raises OverflowError;
        # treat both as "no value" instead of propagating to the parser.
        return None


def _pct_from_used_total(used: int | None, total: int | None) -> float | None:
    """Derive a usage percentage from ``used`` and ``total``.

    Returns ``used / total * 100`` rounded to one decimal place, or ``None``
    when ``used`` is ``None`` or ``total`` is ``None``, zero, or negative.
    """
    if used is None:
        return None
    if not total or not total > 0:
        return None
    ratio = used / total
    return round(ratio * 100, 1)


def _load_avg_entry(values: object, index: int) -> float | None:
    """Return ``values[index]`` as a float, or None if absent or non-numeric."""
    if not isinstance(values, (list, tuple)) or index >= len(values):
        return None
    try:
        return float(values[index])
    except (TypeError, ValueError, OverflowError):
        return None


def parse_health_response(raw: object) -> HealthSnapshot:
    """Parse a gateway ``health`` / ``status`` response into a HealthSnapshot.

    The snapshot's ``recorded_at`` is the time of parsing (``utcnow()``).
    Non-dict input yields a snapshot with only ``recorded_at`` set.  For a
    dict, each metric group (CPU, memory, disk) is read from its nested
    block when that block is a non-empty dict, otherwise from top-level
    alternative keys.  Unknown keys are ignored; fields that are missing or
    malformed are left as ``None``.

    Args:
        raw: The decoded gateway response; any type is accepted.

    Returns:
        A HealthSnapshot populated with whatever could be extracted.
    """
    now = utcnow()
    if not isinstance(raw, dict):
        return HealthSnapshot(recorded_at=now)

    # CPU — try nested block first, then top-level alt keys
    cpu_block = raw.get("cpu")
    if isinstance(cpu_block, dict) and cpu_block:
        cpu_pct = _float(cpu_block, "usage", "percent", "pct")
        load_avgs = cpu_block.get("loadAvg") or cpu_block.get("load_avg")
        # Each entry is converted independently so one bad value neither
        # raises nor discards the other readings.
        load_avg_1m = _load_avg_entry(load_avgs, 0)
        load_avg_5m = _load_avg_entry(load_avgs, 1)
        load_avg_15m = _load_avg_entry(load_avgs, 2)
    else:
        cpu_pct = _float(raw, "cpuUsage", "cpu_usage", "cpu_pct", "cpu_percent")
        load_avg_1m = load_avg_5m = load_avg_15m = None

    # Memory — try nested block first, then top-level alt keys
    mem_block = raw.get("memory") or raw.get("mem")
    if isinstance(mem_block, dict) and mem_block:
        mem_used = _int(mem_block, "used", "rss", "heapUsed")
        mem_total = _int(mem_block, "total", "heapTotal")
        mem_pct = _float(mem_block, "percent", "pct", "usage")
        # Compare against None, not truthiness: a reported 0.0 is a valid
        # reading and must not be replaced by the derived value.
        if mem_pct is None:
            mem_pct = _pct_from_used_total(mem_used, mem_total)
    else:
        mem_used = _int(raw, "memUsed", "mem_used", "memory_used")
        mem_total = _int(raw, "memTotal", "mem_total", "memory_total")
        mem_pct = _pct_from_used_total(mem_used, mem_total)

    # Disk — try nested block first, then top-level alt keys
    disk_block = raw.get("disk") or raw.get("storage")
    if isinstance(disk_block, dict) and disk_block:
        disk_used = _int(disk_block, "used")
        disk_total = _int(disk_block, "total")
        disk_pct = _float(disk_block, "percent", "pct", "usage")
        if disk_pct is None:
            disk_pct = _pct_from_used_total(disk_used, disk_total)
    else:
        disk_used = _int(raw, "diskUsed", "disk_used")
        disk_total = _int(raw, "diskTotal", "disk_total")
        disk_pct = _pct_from_used_total(disk_used, disk_total)

    # Uptime
    uptime = _int(raw, "uptime", "uptimeSeconds", "uptime_seconds")

    # Hostname / platform — empty or falsy values are reported as None
    hostname = raw.get("hostname") or raw.get("host")
    platform = raw.get("platform") or raw.get("os")

    return HealthSnapshot(
        recorded_at=now,
        cpu_pct=cpu_pct,
        memory_pct=mem_pct,
        memory_used_bytes=mem_used,
        memory_total_bytes=mem_total,
        disk_pct=disk_pct,
        disk_used_bytes=disk_used,
        disk_total_bytes=disk_total,
        uptime_seconds=uptime,
        load_avg_1m=load_avg_1m,
        load_avg_5m=load_avg_5m,
        load_avg_15m=load_avg_15m,
        hostname=str(hostname) if hostname else None,
        platform=str(platform) if platform else None,
    )


# ---------------------------------------------------------------------------
# Rolling history
# ---------------------------------------------------------------------------

class HealthHistory:
    """Per-gateway rolling store of HealthSnapshots, held in memory.

    Entries older than ``window_hours`` are discarded, but only for the
    gateway being written to and only at the moment ``add`` is called.
    """

    def __init__(self, window_hours: int = DEFAULT_HISTORY_WINDOW_HOURS) -> None:
        self.window_hours = window_hours
        # gateway ID -> snapshots, kept sorted oldest-first by ``add``
        self._data: dict[str, list[HealthSnapshot]] = {}

    def add(self, gateway_id: str, snapshot: HealthSnapshot) -> None:
        """Record ``snapshot`` for ``gateway_id`` and drop expired entries."""
        entries = self._data.setdefault(gateway_id, [])
        entries.append(snapshot)

        oldest_allowed = utcnow() - timedelta(hours=self.window_hours)
        retained = [entry for entry in entries if entry.recorded_at >= oldest_allowed]
        retained.sort(key=lambda entry: entry.recorded_at)
        self._data[gateway_id] = retained

    def get(self, gateway_id: str) -> list[HealthSnapshot]:
        """Return a copy of the gateway's snapshots, oldest first.

        Unknown gateways yield an empty list.
        """
        return [*self._data.get(gateway_id, [])]

    def latest(self, gateway_id: str) -> HealthSnapshot | None:
        """Return the newest stored snapshot, or None if there is none."""
        stored = self._data.get(gateway_id)
        if not stored:
            return None
        return stored[-1]


# ---------------------------------------------------------------------------
# Gateway fetch
# ---------------------------------------------------------------------------

# Module-level singleton — shared across requests.
# Written by fetch_health() when ``record`` is true and read by
# get_history().  Built with the default window, so it retains
# DEFAULT_HISTORY_WINDOW_HOURS of snapshots per gateway.
_history = HealthHistory()


async def fetch_health(
    gateway_id: str,
    config: GatewayConfig,
    *,
    record: bool = True,
) -> HealthSnapshot:
    """Query the gateway for its current health and return a parsed snapshot.

    The ``health`` RPC method is tried first, then ``status``; the first
    call that returns a non-empty dict is used.  Failed calls are logged and
    skipped, so if neither method succeeds the result is an empty snapshot.

    Args:
        gateway_id: Key under which the snapshot is stored in the history.
        config: Gateway connection settings passed to ``openclaw_call``.
        record: When true, append the snapshot to the module-level history
            (this includes empty snapshots produced by a failed fetch).

    Returns:
        The parsed HealthSnapshot.
    """
    payload: dict[str, Any] = {}
    for rpc_method in ("health", "status"):
        try:
            response = await openclaw_call(rpc_method, config=config)
            if not (isinstance(response, dict) and response):
                # Empty or non-dict reply: fall through to the next method.
                continue
            payload = response
            break
        except (OpenClawGatewayError, TimeoutError, OSError, RuntimeError) as exc:
            # Expected transport/gateway failures are logged quietly.
            logger.debug("system_health.fetch_failed method=%s error=%s", rpc_method, exc)
        except Exception as exc:
            logger.warning("system_health.fetch_unexpected method=%s error=%s", rpc_method, exc)

    parsed = parse_health_response(payload)
    if record:
        _history.add(gateway_id, parsed)
    return parsed


def get_history(gateway_id: str) -> list[HealthSnapshot]:
    """Return the recorded snapshots for ``gateway_id``, oldest first.

    Reads from the module-level history; a gateway with no recorded
    snapshots yields an empty list.
    """
    snapshots = _history.get(gateway_id)
    return snapshots