feat(gateway-ops): memory, cron, and health panels (batch 5, #34)

This commit is contained in:
null 2026-05-20 21:32:46 -05:00
parent 16b23eef2e
commit 4e40323e71
16 changed files with 2404 additions and 1 deletions

View File

@ -31,11 +31,26 @@ from app.schemas.gateways import (
GatewayTemplatesSyncResult,
GatewayUpdate,
)
from app.schemas.gateway_ops import (
CronJobRead,
CronStatusResponse,
HealthSnapshotRead,
SystemHealthResponse,
)
from app.schemas.runtime_usage import (
ProviderUsageResponse,
ProviderUsageScrapeResult,
RuntimeUsageResponse,
)
from app.services.openclaw.cron_status import (
compute_job_status,
fetch_cron_jobs,
)
from app.services.openclaw.system_health import (
DEFAULT_HISTORY_WINDOW_HOURS,
fetch_health,
get_history,
)
from app.services.openclaw.runtime_activity import (
HISTORY_FETCH_LIMIT,
POLL_HISTORY_SESSIONS_MAX,
@ -360,6 +375,106 @@ async def get_gateway_provider_usage(
)
@router.get(
    "/{gateway_id}/cron",
    response_model=CronStatusResponse,
    summary="Gateway cron job status",
    description="Return the list of cron jobs configured on the gateway with their last-run status.",
)
async def get_gateway_cron(
    gateway_id: UUID,
    session: AsyncSession = SESSION_DEP,
    ctx: OrganizationContext = ORG_MEMBER_DEP,
) -> CronStatusResponse:
    """Return the gateway's cron jobs, each with a normalised display status.

    The gateway is resolved through ``require_gateway`` with the caller's
    organisation id before any RPC is issued. ``fetch_cron_jobs`` returns an
    empty list when the gateway call fails, so a failed fetch surfaces here as
    ``jobs == []`` rather than as an error response.
    """
    from app.services.openclaw.gateway_rpc import GatewayConfig as GatewayClientConfig

    gateway = await GatewayAdminLifecycleService(session).require_gateway(
        gateway_id=gateway_id,
        organization_id=ctx.organization.id,
    )
    client_config = GatewayClientConfig(
        url=gateway.url,
        token=gateway.token,
        allow_insecure_tls=gateway.allow_insecure_tls,
        disable_device_pairing=gateway.disable_device_pairing,
    )
    cron_jobs = await fetch_cron_jobs(client_config)

    # Stamp the response before converting entries, matching the order in
    # which the response fields were originally evaluated.
    generated_at = utcnow()
    job_reads: list[CronJobRead] = []
    for cron_job in cron_jobs:
        job_reads.append(
            CronJobRead(
                name=cron_job.name,
                schedule=cron_job.schedule,
                enabled=cron_job.enabled,
                status=compute_job_status(cron_job),
                last_run=cron_job.last_run,
                next_run=cron_job.next_run,
                last_duration_ms=cron_job.last_duration_ms,
                last_error=cron_job.last_error,
            )
        )
    return CronStatusResponse(
        gateway_id=gateway_id,
        generated_at=generated_at,
        jobs=job_reads,
    )
@router.get(
    "/{gateway_id}/health",
    response_model=SystemHealthResponse,
    summary="Gateway system health",
    description="Return current CPU, RAM, disk, and uptime stats plus a 24-hour history.",
)
async def get_gateway_health(
    gateway_id: UUID,
    session: AsyncSession = SESSION_DEP,
    ctx: OrganizationContext = ORG_MEMBER_DEP,
) -> SystemHealthResponse:
    """Fetch the gateway's current health reading and return it with its history.

    The reading is fetched with ``record=True``, so every call to this
    endpoint appends one snapshot to the in-process rolling history. The
    history is read back afterwards and therefore already contains the
    reading returned as ``current``.
    """
    from app.services.openclaw.gateway_rpc import GatewayConfig as GatewayClientConfig

    gateway = await GatewayAdminLifecycleService(session).require_gateway(
        gateway_id=gateway_id,
        organization_id=ctx.organization.id,
    )
    client_config = GatewayClientConfig(
        url=gateway.url,
        token=gateway.token,
        allow_insecure_tls=gateway.allow_insecure_tls,
        disable_device_pairing=gateway.disable_device_pairing,
    )
    current = await fetch_health(str(gateway_id), client_config, record=True)
    recorded = get_history(str(gateway_id))

    # Attributes copied one-to-one from a service snapshot onto the schema.
    snapshot_fields = (
        "recorded_at",
        "cpu_pct",
        "memory_pct",
        "memory_used_bytes",
        "memory_total_bytes",
        "disk_pct",
        "disk_used_bytes",
        "disk_total_bytes",
        "uptime_seconds",
        "load_avg_1m",
        "load_avg_5m",
        "load_avg_15m",
        "hostname",
        "platform",
    )

    def _to_read(source) -> HealthSnapshotRead:
        """Build the response schema for one service-layer snapshot."""
        return HealthSnapshotRead(
            **{field_name: getattr(source, field_name) for field_name in snapshot_fields}
        )

    generated_at = utcnow()
    current_read = _to_read(current)
    history_reads = [_to_read(entry) for entry in recorded]
    return SystemHealthResponse(
        gateway_id=gateway_id,
        generated_at=generated_at,
        current=current_read,
        history=history_reads,
        history_window_hours=DEFAULT_HISTORY_WINDOW_HOURS,
    )
@router.get(
"/{gateway_id}/runtime-activity",
summary="Recent gateway runtime messages (REST snapshot)",

View File

@ -0,0 +1,73 @@
"""Response schemas for gateway operational visibility endpoints.
Covers: cron job status, system health snapshots, and health history.
All fields are optional — partial data is expected when the gateway
returns unexpected formats or a field is simply absent.
"""
from __future__ import annotations
from datetime import datetime
from uuid import UUID
from sqlmodel import SQLModel
# NOTE(review): presumably this tuple exists to keep ``datetime`` and ``UUID``
# referenced at runtime, since ``from __future__ import annotations`` turns the
# field annotations below into strings — confirm before removing.
RUNTIME_ANNOTATION_TYPES = (datetime, UUID)
# ---------------------------------------------------------------------------
# Cron
# ---------------------------------------------------------------------------
class CronJobRead(SQLModel):
    """One cron job entry returned by GET /gateways/{id}/cron.

    ``name`` and ``status`` are required; every other field has a default so
    that a partially populated job can still be serialised.
    """

    name: str
    # Schedule expression as plain text; empty string when none is known.
    schedule: str = ""
    enabled: bool = True
    # Normalised display status; the expected values are listed alongside.
    status: str  # "ok" | "error" | "running" | "pending" | "disabled" | "unknown"
    # Run timestamps are carried as strings, not datetimes.
    # NOTE(review): presumably ISO-8601 — confirm against the gateway payload.
    last_run: str | None = None
    next_run: str | None = None
    # Duration of the last run, in milliseconds per the field name.
    last_duration_ms: int | None = None
    last_error: str | None = None
class CronStatusResponse(SQLModel):
    """Response for GET /gateways/{id}/cron.

    All three fields are required; ``jobs`` may be an empty list.
    """

    # Gateway the job list was read from.
    gateway_id: UUID
    # Time at which this response was assembled.
    generated_at: datetime
    jobs: list[CronJobRead]
# ---------------------------------------------------------------------------
# Health
# ---------------------------------------------------------------------------
class HealthSnapshotRead(SQLModel):
    """One system health reading.

    Only ``recorded_at`` is required. Every metric defaults to None so that a
    reading with missing or unparseable values can still be returned.
    """

    # Time the reading was taken.
    recorded_at: datetime
    # Percentage metrics.
    # NOTE(review): presumably on a 0-100 scale — confirm against the gateway payload.
    cpu_pct: float | None = None
    memory_pct: float | None = None
    # Byte counts, per the field names.
    memory_used_bytes: int | None = None
    memory_total_bytes: int | None = None
    disk_pct: float | None = None
    disk_used_bytes: int | None = None
    disk_total_bytes: int | None = None
    uptime_seconds: int | None = None
    # Load averages over 1, 5 and 15 minutes, per the field names.
    load_avg_1m: float | None = None
    load_avg_5m: float | None = None
    load_avg_15m: float | None = None
    hostname: str | None = None
    platform: str | None = None
class SystemHealthResponse(SQLModel):
    """Response for GET /gateways/{id}/health.

    Carries the most recent reading plus the retained history for one gateway.
    """

    gateway_id: UUID
    # Time at which this response was assembled.
    generated_at: datetime
    # The reading taken for this request.
    current: HealthSnapshotRead
    history: list[HealthSnapshotRead]  # oldest-first, last 24 hours
    # Length of the retention window, in hours, that ``history`` covers.
    history_window_hours: int = 24

View File

@ -0,0 +1,169 @@
"""Cron job status service — read OpenClaw cron data from the gateway.
Calls the ``cron.list`` RPC method and normalises the result.
This is read-only; cron toggle/run endpoints are out of scope for now.
Parser is deliberately defensive:
- Unknown keys are ignored (schema drift doesn't break anything).
- Missing name → job entry skipped entirely.
- All other missing fields default to None.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any
from app.core.logging import get_logger
from app.services.openclaw.gateway_rpc import (
GatewayConfig,
OpenClawGatewayError,
openclaw_call,
)
logger = get_logger(__name__)
# ---------------------------------------------------------------------------
# Internal data type
# ---------------------------------------------------------------------------
@dataclass
class CronJob:
    """Normalised representation of one cron job entry.

    Only ``name`` is required; the remaining fields default to "not known".
    Field order is part of the constructor signature, so keep it stable.
    """

    name: str
    # Schedule expression as plain text; empty string when none was found.
    schedule: str = ""
    enabled: bool = True
    last_run: str | None = None  # ISO-8601 string or None
    next_run: str | None = None  # ISO-8601 string or None
    # Duration of the last run, in milliseconds per the field name.
    last_duration_ms: int | None = None
    # Raw status text; it is normalised separately for display.
    last_status: str | None = None  # "success", "error", "running", None
    last_error: str | None = None
# ---------------------------------------------------------------------------
# Status helper
# ---------------------------------------------------------------------------
_STATUS_MAP = {
"success": "ok",
"ok": "ok",
"done": "ok",
"error": "error",
"failed": "error",
"fail": "error",
"running": "running",
"active": "running",
}
def compute_job_status(job: CronJob) -> str:
"""Return a normalised status string for display.
Returns: ``"disabled"`` | ``"pending"`` | ``"ok"`` | ``"error"`` |
``"running"`` | ``"unknown"``
"""
if not job.enabled:
return "disabled"
if job.last_run is None:
return "pending"
raw = (job.last_status or "").lower().strip()
return _STATUS_MAP.get(raw, "unknown")
# ---------------------------------------------------------------------------
# Parser — pure function
# ---------------------------------------------------------------------------
def _get_str(d: dict[str, Any], *keys: str) -> str | None:
for k in keys:
v = d.get(k)
if v is not None and str(v).strip():
return str(v).strip()
return None
def _get_int(d: dict[str, Any], *keys: str) -> int | None:
for k in keys:
v = d.get(k)
if v is not None:
try:
return int(float(v))
except (TypeError, ValueError):
pass
return None
def _get_bool(d: dict[str, Any], *keys: str, default: bool = True) -> bool:
for k in keys:
v = d.get(k)
if v is None:
continue
if isinstance(v, bool):
return v
if isinstance(v, (int, float)):
return bool(v)
if isinstance(v, str):
return v.lower() not in {"false", "0", "no", "off", "disabled"}
return default
def parse_cron_jobs(raw: object) -> list[CronJob]:
    """Turn a ``cron.list`` payload into ``CronJob`` objects.

    Each field is looked up under several camelCase and snake_case spellings,
    taking the first one that holds a usable value. Anything that is not a
    list yields an empty result; list items that are not dicts, or that carry
    no usable name, are dropped without error.
    """
    if not isinstance(raw, list):
        return []

    parsed: list[CronJob] = []
    for entry in raw:
        if not isinstance(entry, dict):
            continue
        job_name = _get_str(entry, "name", "title", "id", "key")
        if job_name:
            parsed.append(
                CronJob(
                    name=job_name,
                    schedule=_get_str(entry, "schedule", "cron", "expression") or "",
                    enabled=_get_bool(entry, "enabled", "active", "isEnabled"),
                    last_run=_get_str(entry, "lastRun", "last_run", "lastRunAt", "ran_at"),
                    next_run=_get_str(entry, "nextRun", "next_run", "nextRunAt", "next_at"),
                    last_duration_ms=_get_int(
                        entry, "lastDuration", "last_duration", "duration_ms", "durationMs"
                    ),
                    last_status=_get_str(
                        entry, "lastStatus", "last_status", "result", "status"
                    ),
                    last_error=_get_str(entry, "lastError", "last_error", "error"),
                )
            )
    return parsed
# ---------------------------------------------------------------------------
# Gateway fetch
# ---------------------------------------------------------------------------
async def fetch_cron_jobs(config: GatewayConfig) -> list[CronJob]:
    """Call ``cron.list`` on the gateway and return the parsed jobs.

    This is best-effort: any exception raised by the call or by parsing is
    logged and converted into an empty list. Gateway, timeout, OS and runtime
    errors are logged at debug level; anything else at warning level.
    """
    try:
        payload = await openclaw_call("cron.list", config=config)
        parsed = parse_cron_jobs(payload)
        logger.debug("cron_status.fetched count=%d", len(parsed))
        return parsed
    except (OpenClawGatewayError, TimeoutError, OSError, RuntimeError) as err:
        # Failure classes the original author chose to log quietly.
        logger.debug("cron_status.fetch_failed error=%s", err)
        return []
    except Exception as err:
        # Anything else is surprising enough to log at warning level.
        logger.warning("cron_status.fetch_unexpected error=%s", err)
        return []

View File

@ -0,0 +1,269 @@
"""System health service — CPU, RAM, disk, uptime, and 24-hour history.
Calls the gateway ``health`` or ``status`` RPC methods and normalises
the response into structured health snapshots. A module-level
``HealthHistory`` instance accumulates snapshots per gateway for up to
24 hours.
Parser is deliberately defensive: unknown keys are ignored and all
numeric fields default to None when absent or non-numeric.
"""
from __future__ import annotations

import math
from datetime import datetime, timedelta
from typing import Any
from app.core.logging import get_logger
from app.core.time import utcnow
from app.services.openclaw.gateway_rpc import (
GatewayConfig,
OpenClawGatewayError,
openclaw_call,
)
logger = get_logger(__name__)
DEFAULT_HISTORY_WINDOW_HOURS = 24
# ---------------------------------------------------------------------------
# HealthSnapshot
# ---------------------------------------------------------------------------
class HealthSnapshot:
    """Normalised point-in-time system health reading.

    Instances use ``__slots__`` and so have no per-instance ``__dict__``.
    Only ``recorded_at`` is required; every metric defaults to None, and all
    constructor arguments are keyword-only.
    """

    __slots__ = (
        "recorded_at",
        "cpu_pct",
        "memory_pct",
        "memory_used_bytes",
        "memory_total_bytes",
        "disk_pct",
        "disk_used_bytes",
        "disk_total_bytes",
        "uptime_seconds",
        "load_avg_1m",
        "load_avg_5m",
        "load_avg_15m",
        "hostname",
        "platform",
    )

    def __init__(
        self,
        *,
        recorded_at: datetime,
        cpu_pct: float | None = None,
        memory_pct: float | None = None,
        memory_used_bytes: int | None = None,
        memory_total_bytes: int | None = None,
        disk_pct: float | None = None,
        disk_used_bytes: int | None = None,
        disk_total_bytes: int | None = None,
        uptime_seconds: int | None = None,
        load_avg_1m: float | None = None,
        load_avg_5m: float | None = None,
        load_avg_15m: float | None = None,
        hostname: str | None = None,
        platform: str | None = None,
    ) -> None:
        # Timestamp of the reading.
        self.recorded_at = recorded_at

        # CPU and load.
        self.cpu_pct = cpu_pct

        # Memory.
        self.memory_pct = memory_pct
        self.memory_used_bytes = memory_used_bytes
        self.memory_total_bytes = memory_total_bytes

        # Disk.
        self.disk_pct = disk_pct
        self.disk_used_bytes = disk_used_bytes
        self.disk_total_bytes = disk_total_bytes

        # Uptime and load averages.
        self.uptime_seconds = uptime_seconds
        self.load_avg_1m = load_avg_1m
        self.load_avg_5m = load_avg_5m
        self.load_avg_15m = load_avg_15m

        # Host identity.
        self.hostname = hostname
        self.platform = platform

    def to_dict(self) -> dict[str, Any]:
        """Return the snapshot as a plain dict, with ``recorded_at`` in ISO-8601 text.

        Keys appear in the same order as the slot declaration above.
        """
        payload: dict[str, Any] = {
            slot: getattr(self, slot) for slot in HealthSnapshot.__slots__
        }
        # Overwriting an existing key keeps its position, so ordering is unchanged.
        payload["recorded_at"] = self.recorded_at.isoformat()
        return payload
# ---------------------------------------------------------------------------
# Parser — pure function
# ---------------------------------------------------------------------------
def _float(d: dict[str, Any], *keys: str) -> float | None:
for k in keys:
v = d.get(k)
if v is not None:
try:
return float(v)
except (TypeError, ValueError):
pass
return None
def _int(d: dict[str, Any], *keys: str) -> int | None:
f = _float(d, *keys)
return int(f) if f is not None else None
def _pct_from_used_total(used: int | None, total: int | None) -> float | None:
if used is not None and total and total > 0:
return round(used / total * 100, 1)
return None
def _load_avg_at(values: object, index: int) -> float | None:
    """Return ``values[index]`` as a float, or None when absent or non-numeric."""
    if not isinstance(values, list) or len(values) <= index:
        return None
    try:
        return float(values[index])
    except (TypeError, ValueError):
        # e.g. a null or "n/a" entry; one bad element must not abort parsing.
        return None


def parse_health_response(raw: object) -> HealthSnapshot:
    """Parse a gateway ``health`` / ``status`` response into a HealthSnapshot.

    Designed not to raise on malformed input: a non-dict payload yields a
    snapshot carrying only ``recorded_at``, and any field that is missing or
    unparseable is left as None.

    For CPU, memory and disk, a non-empty nested block (``cpu``,
    ``memory``/``mem``, ``disk``/``storage``) takes precedence; the flat
    top-level keys are consulted only when that block is absent or empty.
    A percentage reported by the gateway wins over one derived from
    used/total, including a reported value of exactly 0.
    """
    now = utcnow()
    if not isinstance(raw, dict):
        return HealthSnapshot(recorded_at=now)

    # CPU: nested block if present, otherwise top-level alternate keys.
    cpu_block = raw.get("cpu")
    if isinstance(cpu_block, dict) and cpu_block:
        cpu_pct = _float(cpu_block, "usage", "percent", "pct")
        load_avgs = cpu_block.get("loadAvg") or cpu_block.get("load_avg")
        load_avg_1m = _load_avg_at(load_avgs, 0)
        load_avg_5m = _load_avg_at(load_avgs, 1)
        load_avg_15m = _load_avg_at(load_avgs, 2)
    else:
        cpu_pct = _float(raw, "cpuUsage", "cpu_usage", "cpu_pct", "cpu_percent")
        load_avg_1m = load_avg_5m = load_avg_15m = None

    # Memory: nested block if present, otherwise top-level alternate keys.
    mem_block = raw.get("memory") or raw.get("mem")
    if isinstance(mem_block, dict) and mem_block:
        mem_used = _int(mem_block, "used", "rss", "heapUsed")
        mem_total = _int(mem_block, "total", "heapTotal")
        mem_pct = _float(mem_block, "percent", "pct", "usage")
        # Compare against None rather than truthiness so that a reported 0.0
        # is kept instead of being replaced by the derived value.
        if mem_pct is None:
            mem_pct = _pct_from_used_total(mem_used, mem_total)
    else:
        mem_used = _int(raw, "memUsed", "mem_used", "memory_used")
        mem_total = _int(raw, "memTotal", "mem_total", "memory_total")
        mem_pct = _pct_from_used_total(mem_used, mem_total)

    # Disk: nested block if present, otherwise top-level alternate keys.
    disk_block = raw.get("disk") or raw.get("storage")
    if isinstance(disk_block, dict) and disk_block:
        disk_used = _int(disk_block, "used")
        disk_total = _int(disk_block, "total")
        disk_pct = _float(disk_block, "percent", "pct", "usage")
        if disk_pct is None:
            disk_pct = _pct_from_used_total(disk_used, disk_total)
    else:
        disk_used = _int(raw, "diskUsed", "disk_used")
        disk_total = _int(raw, "diskTotal", "disk_total")
        disk_pct = _pct_from_used_total(disk_used, disk_total)

    # Uptime
    uptime = _int(raw, "uptime", "uptimeSeconds", "uptime_seconds")

    # Hostname / platform: falsy values (missing, empty string) become None.
    hostname = raw.get("hostname") or raw.get("host")
    platform = raw.get("platform") or raw.get("os")

    return HealthSnapshot(
        recorded_at=now,
        cpu_pct=cpu_pct,
        memory_pct=mem_pct,
        memory_used_bytes=mem_used,
        memory_total_bytes=mem_total,
        disk_pct=disk_pct,
        disk_used_bytes=disk_used,
        disk_total_bytes=disk_total,
        uptime_seconds=uptime,
        load_avg_1m=load_avg_1m,
        load_avg_5m=load_avg_5m,
        load_avg_15m=load_avg_15m,
        hostname=str(hostname) if hostname else None,
        platform=str(platform) if platform else None,
    )
# ---------------------------------------------------------------------------
# Rolling history
# ---------------------------------------------------------------------------
class HealthHistory:
    """In-memory rolling window of HealthSnapshots per gateway ID.

    Snapshots live in a plain dict on the instance. Pruning happens only when
    a snapshot is added, so a gateway that stops reporting keeps its last
    stored entries until the next ``add`` for that gateway.
    """

    def __init__(self, window_hours: int = DEFAULT_HISTORY_WINDOW_HOURS) -> None:
        self.window_hours = window_hours
        # gateway id -> snapshots, kept sorted oldest-first by ``add``.
        self._data: dict[str, list[HealthSnapshot]] = {}

    def add(self, gateway_id: str, snapshot: HealthSnapshot) -> None:
        """Store ``snapshot`` for the gateway, then drop entries older than the window.

        The surviving entries are re-sorted by ``recorded_at``, so snapshots
        may be added out of chronological order.
        """
        entries = self._data.setdefault(gateway_id, [])
        entries.append(snapshot)
        oldest_allowed = utcnow() - timedelta(hours=self.window_hours)
        retained = [entry for entry in entries if entry.recorded_at >= oldest_allowed]
        retained.sort(key=lambda entry: entry.recorded_at)
        self._data[gateway_id] = retained

    def get(self, gateway_id: str) -> list[HealthSnapshot]:
        """Return a copy of the gateway's snapshots, oldest-first (empty if unknown)."""
        stored = self._data.get(gateway_id, [])
        return list(stored)

    def latest(self, gateway_id: str) -> HealthSnapshot | None:
        """Return the newest stored snapshot for the gateway, or None if there is none."""
        stored = self._data.get(gateway_id)
        if not stored:
            return None
        return stored[-1]
# ---------------------------------------------------------------------------
# Gateway fetch
# ---------------------------------------------------------------------------
# Process-wide history store; every request handled by this process shares it.
_history = HealthHistory()


async def fetch_health(
    gateway_id: str,
    config: GatewayConfig,
    *,
    record: bool = True,
) -> HealthSnapshot:
    """Query the gateway for health data and return it as a snapshot.

    The ``health`` RPC is tried first and ``status`` second; the first call
    that returns a non-empty dict is used. Failures are logged and swallowed,
    so when neither call succeeds the result is a snapshot with no metrics.
    When ``record`` is true the snapshot is appended to the shared history
    whether or not any data was obtained.
    """
    payload: dict[str, Any] = {}
    for rpc_method in ("health", "status"):
        try:
            response = await openclaw_call(rpc_method, config=config)
        except (OpenClawGatewayError, TimeoutError, OSError, RuntimeError) as err:
            logger.debug("system_health.fetch_failed method=%s error=%s", rpc_method, err)
        except Exception as err:
            logger.warning("system_health.fetch_unexpected method=%s error=%s", rpc_method, err)
        else:
            # An empty or non-dict reply counts as "no data"; try the next method.
            if isinstance(response, dict) and response:
                payload = response
                break

    snapshot = parse_health_response(payload)
    if record:
        _history.add(gateway_id, snapshot)
    return snapshot
def get_history(gateway_id: str) -> list[HealthSnapshot]:
    """Return the stored snapshots for a gateway from the shared history, oldest-first."""
    snapshots = _history.get(gateway_id)
    return snapshots

View File

@ -0,0 +1,178 @@
# ruff: noqa: INP001
"""Unit tests for cron_status service helpers.
All tests are pure-Python — no gateway connection required.
"""
from __future__ import annotations
import pytest
from app.services.openclaw.cron_status import (
CronJob,
parse_cron_jobs,
compute_job_status,
)
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
FIXTURE_NORMAL = [
{
"name": "sync-agents",
"schedule": "*/5 * * * *",
"enabled": True,
"lastRun": "2026-05-21T10:00:00Z",
"nextRun": "2026-05-21T10:05:00Z",
"lastDuration": 1234,
"lastStatus": "success",
},
{
"name": "cleanup-sessions",
"schedule": "0 3 * * *",
"enabled": False,
"lastRun": None,
"nextRun": None,
"lastDuration": None,
"lastStatus": None,
},
]
FIXTURE_FAILED_JOB = [
{
"name": "broken-task",
"schedule": "*/10 * * * *",
"enabled": True,
"lastRun": "2026-05-21T09:50:00Z",
"nextRun": "2026-05-21T10:00:00Z",
"lastDuration": 5000,
"lastStatus": "error",
"lastError": "connection refused",
},
]
FIXTURE_ALT_KEYS = [
{
"id": "job-1",
"title": "my-job",
"active": True,
"last_run": "2026-05-21T08:00:00Z",
"next_run": "2026-05-21T09:00:00Z",
"duration_ms": 800,
"result": "ok",
}
]
FIXTURE_EMPTY = []
FIXTURE_GARBAGE = "not a list"
FIXTURE_NONE = None
# ---------------------------------------------------------------------------
# parse_cron_jobs
# ---------------------------------------------------------------------------
class TestParseCronJobs:
    """``parse_cron_jobs`` against well-formed, alternate-key and malformed input."""

    def test_normal_jobs_parsed(self):
        # Both fixture entries carry a name, so neither is dropped.
        jobs = parse_cron_jobs(FIXTURE_NORMAL)
        assert len(jobs) == 2

    def test_job_name_extracted(self):
        jobs = parse_cron_jobs(FIXTURE_NORMAL)
        assert jobs[0].name == "sync-agents"

    def test_schedule_extracted(self):
        jobs = parse_cron_jobs(FIXTURE_NORMAL)
        assert jobs[0].schedule == "*/5 * * * *"

    def test_enabled_flag(self):
        jobs = parse_cron_jobs(FIXTURE_NORMAL)
        assert jobs[0].enabled is True
        assert jobs[1].enabled is False

    def test_last_run_parsed(self):
        # The timestamp is kept as the string the gateway supplied.
        jobs = parse_cron_jobs(FIXTURE_NORMAL)
        assert jobs[0].last_run is not None
        assert "2026" in jobs[0].last_run

    def test_last_run_none_when_absent(self):
        jobs = parse_cron_jobs(FIXTURE_NORMAL)
        assert jobs[1].last_run is None

    def test_duration_ms(self):
        jobs = parse_cron_jobs(FIXTURE_NORMAL)
        assert jobs[0].last_duration_ms == 1234

    def test_status_success(self):
        # The parser stores the raw status; normalisation is a separate step.
        jobs = parse_cron_jobs(FIXTURE_NORMAL)
        assert jobs[0].last_status == "success"

    def test_status_none_when_never_run(self):
        jobs = parse_cron_jobs(FIXTURE_NORMAL)
        assert jobs[1].last_status is None

    def test_failed_job(self):
        jobs = parse_cron_jobs(FIXTURE_FAILED_JOB)
        assert jobs[0].last_status == "error"
        assert jobs[0].last_error is not None

    def test_alt_key_names(self):
        # The fixture has both "title" and "id"; "title" is tried first.
        jobs = parse_cron_jobs(FIXTURE_ALT_KEYS)
        assert len(jobs) == 1
        assert jobs[0].name == "my-job"
        assert jobs[0].enabled is True
        assert jobs[0].last_duration_ms == 800

    def test_empty_list(self):
        assert parse_cron_jobs(FIXTURE_EMPTY) == []

    def test_garbage_input_returns_empty(self):
        # A string is not a list, so nothing is parsed.
        assert parse_cron_jobs(FIXTURE_GARBAGE) == []

    def test_none_input_returns_empty(self):
        assert parse_cron_jobs(FIXTURE_NONE) == []

    def test_non_dict_entries_skipped(self):
        # Only the one dict entry survives; the str, int and None are ignored.
        jobs = parse_cron_jobs([FIXTURE_NORMAL[0], "bad", 42, None])
        assert len(jobs) == 1

    def test_missing_name_skipped(self):
        jobs = parse_cron_jobs([{"schedule": "* * * * *", "enabled": True}])
        assert len(jobs) == 0

    def test_returns_cron_job_instances(self):
        jobs = parse_cron_jobs(FIXTURE_NORMAL)
        assert all(isinstance(j, CronJob) for j in jobs)
# ---------------------------------------------------------------------------
# compute_job_status
# ---------------------------------------------------------------------------
class TestComputeJobStatus:
    """``compute_job_status`` precedence: disabled, then pending, then mapped status."""

    def test_disabled_returns_disabled(self):
        job = CronJob(name="j", schedule="* * *", enabled=False)
        assert compute_job_status(job) == "disabled"

    def test_never_run_returns_pending(self):
        job = CronJob(name="j", schedule="* * *", enabled=True, last_run=None)
        assert compute_job_status(job) == "pending"

    def test_success_returns_ok(self):
        job = CronJob(name="j", schedule="* * *", enabled=True,
                      last_run="2026-05-21T10:00:00Z", last_status="success")
        assert compute_job_status(job) == "ok"

    def test_error_returns_error(self):
        job = CronJob(name="j", schedule="* * *", enabled=True,
                      last_run="2026-05-21T10:00:00Z", last_status="error")
        assert compute_job_status(job) == "error"

    def test_running_returns_running(self):
        # "running" is a recognised status and must map to itself.
        job = CronJob(name="j", schedule="* * *", enabled=True,
                      last_run="2026-05-21T10:00:00Z", last_status="running")
        assert compute_job_status(job) == "running"

    def test_unknown_status_returns_unknown(self):
        # A status that is not in the mapping must fall back to "unknown".
        job = CronJob(name="j", schedule="* * *", enabled=True,
                      last_run="2026-05-21T10:00:00Z", last_status="paused-by-operator")
        assert compute_job_status(job) == "unknown"

    def test_missing_status_after_run_returns_unknown(self):
        # A job that has run but reports no status is also "unknown".
        job = CronJob(name="j", schedule="* * *", enabled=True,
                      last_run="2026-05-21T10:00:00Z", last_status=None)
        assert compute_job_status(job) == "unknown"

View File

@ -0,0 +1,184 @@
# ruff: noqa: INP001
"""Unit tests for system_health service helpers.
All tests are pure-Python — no gateway connection required.
"""
from __future__ import annotations
from datetime import datetime, timedelta
import pytest
from app.services.openclaw.system_health import (
HealthSnapshot,
HealthHistory,
parse_health_response,
)
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
FIXTURE_FULL = {
"cpu": {"usage": 45.2, "cores": 8, "loadAvg": [1.2, 1.5, 1.8]},
"memory": {"used": 4_294_967_296, "total": 16_777_216_000, "percent": 25.6},
"disk": {"used": 50_000_000_000, "total": 500_000_000_000, "percent": 10.0},
"uptime": 86400,
"platform": "linux",
"hostname": "gateway-host",
}
FIXTURE_ALT_KEYS = {
"cpuUsage": 33.3,
"memUsed": 2_000_000_000,
"memTotal": 8_000_000_000,
"diskUsed": 100_000_000_000,
"diskTotal": 1_000_000_000_000,
"uptimeSeconds": 3600,
}
FIXTURE_MINIMAL = {"uptime": 120}
FIXTURE_EMPTY = {}
FIXTURE_NONE = None
# ---------------------------------------------------------------------------
# parse_health_response
# ---------------------------------------------------------------------------
class TestParseHealthResponse:
    """``parse_health_response`` against nested, flat-key, sparse and invalid payloads."""

    def test_full_response_parsed(self):
        snap = parse_health_response(FIXTURE_FULL)
        assert isinstance(snap, HealthSnapshot)

    def test_cpu_percent(self):
        # Read from the nested "cpu" block's "usage" key.
        snap = parse_health_response(FIXTURE_FULL)
        assert snap.cpu_pct == pytest.approx(45.2)

    def test_memory_percent(self):
        # The fixture reports "percent" explicitly, so it is not derived.
        snap = parse_health_response(FIXTURE_FULL)
        assert snap.memory_pct == pytest.approx(25.6)

    def test_disk_percent(self):
        snap = parse_health_response(FIXTURE_FULL)
        assert snap.disk_pct == pytest.approx(10.0)

    def test_uptime_seconds(self):
        snap = parse_health_response(FIXTURE_FULL)
        assert snap.uptime_seconds == 86400

    def test_hostname(self):
        snap = parse_health_response(FIXTURE_FULL)
        assert snap.hostname == "gateway-host"

    def test_alt_key_cpu(self):
        # No nested "cpu" block, so the top-level "cpuUsage" key is used.
        snap = parse_health_response(FIXTURE_ALT_KEYS)
        assert snap.cpu_pct == pytest.approx(33.3)

    def test_alt_key_memory_computed(self):
        snap = parse_health_response(FIXTURE_ALT_KEYS)
        # 2GB / 8GB = 25%
        assert snap.memory_pct == pytest.approx(25.0, abs=1)

    def test_alt_key_disk_computed(self):
        snap = parse_health_response(FIXTURE_ALT_KEYS)
        # 100GB / 1000GB = 10%
        assert snap.disk_pct == pytest.approx(10.0, abs=1)

    def test_alt_key_uptime(self):
        snap = parse_health_response(FIXTURE_ALT_KEYS)
        assert snap.uptime_seconds == 3600

    def test_minimal_response(self):
        # Only uptime is present; every other metric stays None.
        snap = parse_health_response(FIXTURE_MINIMAL)
        assert snap.uptime_seconds == 120
        assert snap.cpu_pct is None
        assert snap.memory_pct is None

    def test_empty_response(self):
        snap = parse_health_response(FIXTURE_EMPTY)
        assert snap.cpu_pct is None
        assert snap.memory_pct is None
        assert snap.uptime_seconds is None

    def test_none_response(self):
        snap = parse_health_response(FIXTURE_NONE)
        assert snap is not None  # always returns a snapshot
        assert snap.cpu_pct is None

    def test_recorded_at_is_set(self):
        snap = parse_health_response(FIXTURE_FULL)
        assert isinstance(snap.recorded_at, datetime)
# ---------------------------------------------------------------------------
# HealthHistory
# ---------------------------------------------------------------------------
class TestHealthHistory:
    """``HealthHistory`` storage, pruning, ordering and per-gateway isolation."""

    def _make_snap(self, offset_hours: float = 0) -> HealthSnapshot:
        """Build a snapshot whose ``recorded_at`` is ``offset_hours`` in the past."""
        from app.core.time import utcnow

        snap = parse_health_response(FIXTURE_FULL)
        snap.recorded_at = utcnow() - timedelta(hours=offset_hours)
        return snap

    def test_add_snapshot(self):
        history = HealthHistory()
        history.add("gw-1", self._make_snap())
        assert len(history.get("gw-1")) == 1

    def test_multiple_snapshots(self):
        history = HealthHistory()
        for _ in range(5):
            history.add("gw-1", self._make_snap())
        assert len(history.get("gw-1")) == 5

    def test_old_snapshots_pruned(self):
        history = HealthHistory(window_hours=24)
        # Add a snapshot 25 hours old
        old = self._make_snap(offset_hours=25)
        history.add("gw-1", old)
        # Add a recent snapshot
        recent = self._make_snap()
        history.add("gw-1", recent)
        snaps = history.get("gw-1")
        assert len(snaps) == 1  # old one pruned
        # Check identity so that keeping the wrong snapshot cannot pass.
        assert snaps[0] is recent

    def test_different_gateways_isolated(self):
        history = HealthHistory()
        history.add("gw-1", self._make_snap())
        history.add("gw-2", self._make_snap())
        assert len(history.get("gw-1")) == 1
        assert len(history.get("gw-2")) == 1

    def test_unknown_gateway_returns_empty(self):
        history = HealthHistory()
        assert history.get("nonexistent") == []

    def test_snapshots_ordered_oldest_first(self):
        history = HealthHistory()
        newer = self._make_snap(offset_hours=0)
        older = self._make_snap(offset_hours=1)
        # Inserted newest-first on purpose; the store must reorder them.
        history.add("gw-1", newer)
        history.add("gw-1", older)
        snaps = history.get("gw-1")
        # oldest first
        assert len(snaps) == 2
        assert snaps[0] is older
        assert snaps[1] is newer

    def test_latest_snapshot(self):
        history = HealthHistory()
        old = self._make_snap(offset_hours=2)
        new = self._make_snap(offset_hours=0)
        history.add("gw-1", old)
        history.add("gw-1", new)
        latest = history.latest("gw-1")
        # A timestamp comparison against ``old`` would also pass if ``old``
        # itself were returned, so assert on identity instead.
        assert latest is new

    def test_latest_none_when_empty(self):
        history = HealthHistory()
        assert history.latest("gw-1") is None

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,20 @@
/**
* Generated by orval v8.3.0 🍺
* Do not edit manually.
* Mission Control API
* OpenAPI spec version: 0.1.0
*/
/**
* One cron job entry returned by GET /gateways/{id}/cron.
*/
export interface CronJobRead {
name: string;
schedule?: string;
enabled?: boolean;
status: string;
last_run?: string | null;
next_run?: string | null;
last_duration_ms?: number | null;
last_error?: string | null;
}

View File

@ -0,0 +1,16 @@
/**
* Generated by orval v8.3.0 🍺
* Do not edit manually.
* Mission Control API
* OpenAPI spec version: 0.1.0
*/
import type { CronJobRead } from "./cronJobRead";
/**
* Response for GET /gateways/{id}/cron.
*/
export interface CronStatusResponse {
gateway_id: string;
generated_at: string;
jobs: CronJobRead[];
}

View File

@ -0,0 +1,9 @@
/**
* Generated by orval v8.3.0 🍺
* Do not edit manually.
* Mission Control API
* OpenAPI spec version: 0.1.0
*/
export type GetGatewayRuntimeActivityApiV1GatewaysGatewayIdRuntimeActivityGet200 =
{ [key: string]: unknown };

View File

@ -0,0 +1,26 @@
/**
* Generated by orval v8.3.0 🍺
* Do not edit manually.
* Mission Control API
* OpenAPI spec version: 0.1.0
*/
/**
* One system health reading.
*/
export interface HealthSnapshotRead {
recorded_at: string;
cpu_pct?: number | null;
memory_pct?: number | null;
memory_used_bytes?: number | null;
memory_total_bytes?: number | null;
disk_pct?: number | null;
disk_used_bytes?: number | null;
disk_total_bytes?: number | null;
uptime_seconds?: number | null;
load_avg_1m?: number | null;
load_avg_5m?: number | null;
load_avg_15m?: number | null;
hostname?: string | null;
platform?: string | null;
}

View File

@ -78,6 +78,8 @@ export * from "./boardWebhookPayloadReadPayload";
export * from "./boardWebhookRead";
export * from "./boardWebhookUpdate";
export * from "./closeIssueResponse";
export * from "./cronJobRead";
export * from "./cronStatusResponse";
export * from "./dashboardKpis";
export * from "./dashboardMetrics";
export * from "./dashboardMetricsApiV1MetricsDashboardGetParams";
@ -145,10 +147,12 @@ export * from "./getBoardGroupSnapshotApiV1BoardGroupsGroupIdSnapshotGetParams";
export * from "./getBoardGroupSnapshotApiV1BoardsBoardIdGroupSnapshotGetParams";
export * from "./getForgejoHeatmapApiV1ForgejoHeatmapGetParams";
export * from "./getForgejoMetricsApiV1ForgejoMetricsGetParams";
export * from "./getGatewayRuntimeActivityApiV1GatewaysGatewayIdRuntimeActivityGet200";
export * from "./getGatewaySessionApiV1GatewaysSessionsSessionIdGetParams";
export * from "./getSessionHistoryApiV1GatewaysSessionsSessionIdHistoryGetParams";
export * from "./getWebhookPayloadApiV1AgentBoardsBoardIdWebhooksWebhookIdPayloadsPayloadIdGetParams";
export * from "./healthHealthGet200";
export * from "./healthSnapshotRead";
export * from "./healthStatusResponse";
export * from "./healthzHealthzGet200";
export * from "./heatmapDay";
@ -254,6 +258,7 @@ export * from "./streamTaskCommentFeedApiV1ActivityTaskCommentsStreamGetParams";
export * from "./streamTasksApiV1BoardsBoardIdTasksStreamGetParams";
export * from "./syncGatewayTemplatesApiV1GatewaysGatewayIdTemplatesSyncPostParams";
export * from "./syncRepositoryIssuesApiV1ForgejoRepositoriesRepositoryIdSyncPost200";
export * from "./systemHealthResponse";
export * from "./tagCreate";
export * from "./tagRead";
export * from "./tagRef";

View File

@ -0,0 +1,18 @@
/**
* Generated by orval v8.3.0 🍺
* Do not edit manually.
* Mission Control API
* OpenAPI spec version: 0.1.0
*/
import type { HealthSnapshotRead } from "./healthSnapshotRead";
/**
* Response for GET /gateways/{id}/health.
*
* Pairs one `current` snapshot with a `history` list of snapshots of the
* same type. This file is generated by orval; regenerate from the OpenAPI
* spec rather than hand-editing the fields.
*/
export interface SystemHealthResponse {
// Identifier of the gateway this response describes.
gateway_id: string;
// presumably an ISO-8601 timestamp of when the response was built — confirm against the backend schema
generated_at: string;
// Snapshot the dashboard health panel reads its metrics from.
current: HealthSnapshotRead;
// Additional snapshots of the same shape; ordering is not visible from this type.
history: HealthSnapshotRead[];
// Optional; presumably the time span covered by `history`, in hours — confirm against the backend.
history_window_hours?: number;
}

View File

@ -32,12 +32,18 @@ import {
getGatewayRuntimeUsageApiV1GatewaysGatewayIdRuntimeUsageGet,
} from "@/api/generated/gateways/gateways";
import type { GatewaysStatusResponse } from "@/api/generated/model/gatewaysStatusResponse";
import type { RuntimeUsageResponse } from "@/api/generated/model";
import type { CronStatusResponse, RuntimeUsageResponse, SystemHealthResponse } from "@/api/generated/model";
import {
getGatewayCronApiV1GatewaysGatewayIdCronGet,
getGatewayHealthApiV1GatewaysGatewayIdHealthGet,
} from "@/api/generated/gateways/gateways";
import {
RuntimeUsageSection,
aggregateRuntimeUsage,
type AggregatedRuntimeUsage,
} from "@/components/dashboard/RuntimeUsageSection";
import { GatewayHealthPanel } from "@/components/dashboard/GatewayHealthPanel";
import { GatewayCronPanel } from "@/components/dashboard/GatewayCronPanel";
import {
type listAgentsApiV1AgentsGetResponse,
useListAgentsApiV1AgentsGet,
@ -667,6 +673,34 @@ export default function DashboardPage() {
const runtimeUsage = runtimeUsageQuery.data ?? null;
// Gateway health — query the first gateway only for the compact dashboard panel
const primaryGatewayId = gatewayTargets[0]?.gatewayId ?? null;
const gatewayHealthQuery = useQuery<SystemHealthResponse | null, ApiError>({
queryKey: ["dashboard", "gateway-health", primaryGatewayId],
enabled: Boolean(isSignedIn && primaryGatewayId),
refetchInterval: 60_000,
refetchOnMount: "always",
queryFn: () => {
if (!primaryGatewayId) return Promise.resolve(null);
return getGatewayHealthApiV1GatewaysGatewayIdHealthGet(primaryGatewayId).then(
(r) => (r.status === 200 ? (r.data as SystemHealthResponse) : null),
);
},
});
const gatewayCronQuery = useQuery<CronStatusResponse | null, ApiError>({
queryKey: ["dashboard", "gateway-cron", primaryGatewayId],
enabled: Boolean(isSignedIn && primaryGatewayId),
refetchInterval: 60_000,
refetchOnMount: "always",
queryFn: () => {
if (!primaryGatewayId) return Promise.resolve(null);
return getGatewayCronApiV1GatewaysGatewayIdCronGet(primaryGatewayId).then(
(r) => (r.status === 200 ? (r.data as CronStatusResponse) : null),
);
},
});
// Build a session-id → TopSession lookup for enriching session summaries
const topSessionById = useMemo(() => {
const map = new Map<string, { costUsd: number; totalTokens: number; model: string | null }>();
@ -1110,6 +1144,21 @@ export default function DashboardPage() {
formatTimestamp={formatTimestamp}
/>
</div>
{hasConfiguredGateways && (
<div className="mt-4 grid grid-cols-1 gap-4 md:grid-cols-2">
<GatewayHealthPanel
health={gatewayHealthQuery.data ?? null}
isLoading={gatewayHealthQuery.isLoading}
hasGateways={hasConfiguredGateways}
/>
<GatewayCronPanel
cron={gatewayCronQuery.data ?? null}
isLoading={gatewayCronQuery.isLoading}
hasGateways={hasConfiguredGateways}
/>
</div>
)}
</div>
</main>
</SignedIn>

View File

@ -0,0 +1,103 @@
"use client";
import { CheckCircle2, AlertCircle, Clock, XCircle, Loader2 } from "lucide-react";
import type { CronStatusResponse } from "@/api/generated/model";
import { DashboardSection } from "./DashboardSection";
import { DashboardEmptyState } from "./DashboardEmptyState";
/** Props accepted by GatewayCronPanel. */
interface GatewayCronPanelProps {
// Cron status payload to render; null means no data (shown as "loading" while isLoading is set, otherwise as an empty job list).
cron: CronStatusResponse | null;
// True while the cron query is in flight; the component defaults it to false.
isLoading?: boolean;
// When false the panel renders nothing at all.
hasGateways: boolean;
}
// Job status values that have a dedicated presentation in this panel.
type StatusKey = "ok" | "error" | "running" | "pending" | "disabled" | "unknown";
// Presentation for each status: the icon component, the CSS classes applied to
// both the icon and the label, and the label text. GatewayCronPanel falls back
// to the `unknown` entry when a job's status is missing or not listed here.
const STATUS_META: Record<StatusKey, { icon: React.ElementType; cls: string; label: string }> = {
ok: { icon: CheckCircle2, cls: "text-[color:var(--success)]", label: "OK" },
error: { icon: AlertCircle, cls: "text-[color:var(--danger)]", label: "Error" },
// `animate-spin` makes the loader icon rotate while a job is running.
running: { icon: Loader2, cls: "text-[color:var(--accent)] animate-spin", label: "Running" },
pending: { icon: Clock, cls: "text-muted", label: "Pending" },
disabled: { icon: XCircle, cls: "text-muted opacity-40", label: "Disabled" },
unknown: { icon: Clock, cls: "text-muted", label: "Unknown" },
};
/**
 * Format a duration given in milliseconds for compact display.
 *
 * @param ms Duration in milliseconds, or null/undefined when unknown.
 * @returns "" when no value is given, "<n>ms" below one second, otherwise
 *          the value in seconds with one decimal place (e.g. "1.5s").
 */
function fmtMs(ms: number | null | undefined): string {
  if (ms === null || ms === undefined) {
    return "";
  }
  const isSubSecond = ms < 1000;
  return isSubSecond ? `${ms}ms` : `${(ms / 1000).toFixed(1)}s`;
}
/**
 * Render an ISO timestamp as a short phrase relative to the current time.
 *
 * Past instants read "5m ago" / "3h ago" / "2d ago"; future instants (such as
 * a job's next scheduled run) read "in 5m" / "in 3h" / "in 2d". Anything
 * within one minute of now, in either direction, reads "just now", which also
 * absorbs small clock skew between the server and the browser.
 *
 * @param iso Timestamp string accepted by the Date constructor, or
 *            null/undefined/"" when the event never happened.
 * @returns "never" for a missing value, "—" for an unparseable one, otherwise
 *          the relative phrase described above.
 */
function relativeTime(iso: string | null | undefined): string {
  if (!iso) return "never";

  const timestamp = new Date(iso).getTime();
  // An unparseable string does not throw; it yields an Invalid Date whose
  // getTime() is NaN, so it has to be detected explicitly.
  if (Number.isNaN(timestamp)) return "—";

  const diff = Date.now() - timestamp;
  const magnitude = Math.abs(diff);
  if (magnitude < 60_000) return "just now";

  let amount: string;
  if (magnitude < 3_600_000) {
    amount = `${Math.floor(magnitude / 60_000)}m`;
  } else if (magnitude < 86_400_000) {
    amount = `${Math.floor(magnitude / 3_600_000)}h`;
  } else {
    amount = `${Math.floor(magnitude / 86_400_000)}d`;
  }

  // A negative diff means the instant lies in the future.
  return diff < 0 ? `in ${amount}` : `${amount} ago`;
}
/**
 * Dashboard panel listing a gateway's cron jobs with their last-run status.
 *
 * Renders nothing when no gateways are configured. Otherwise shows a loading
 * placeholder while the first fetch is in flight, an empty-state message when
 * there are no jobs, or one row per job (status icon, name, schedule, last /
 * next run, last duration and last error).
 */
export function GatewayCronPanel({
  cron,
  isLoading = false,
  hasGateways,
}: GatewayCronPanelProps) {
  if (!hasGateways) {
    return null;
  }

  const cronJobs = cron?.jobs ?? [];
  const jobCount = cronJobs.length;
  // The section header shows the job count as its action label when there are jobs.
  const headerAction = jobCount > 0 ? { label: `${jobCount}`, href: "#" } : undefined;
  // Only show the loading placeholder before any data has arrived.
  const showLoading = isLoading && !cron;

  const renderJobRow = (job: (typeof cronJobs)[number]) => {
    // Missing or unrecognised statuses fall back to the "unknown" presentation.
    const statusKey = (job.status as StatusKey) ?? "unknown";
    const { icon: StatusIcon, cls: statusCls, label: statusLabel } =
      STATUS_META[statusKey] ?? STATUS_META.unknown;

    return (
      <div
        key={job.name}
        className="flex min-w-0 items-start gap-2 rounded-lg border border-[color:var(--border)] bg-[color:var(--surface-muted)] px-3 py-2"
      >
        <StatusIcon className={`mt-0.5 h-3.5 w-3.5 shrink-0 ${statusCls}`} />
        <div className="min-w-0 flex-1">
          <div className="flex items-baseline gap-2">
            <p className="truncate text-sm font-medium text-strong">{job.name}</p>
            {job.schedule && (
              <code className="shrink-0 text-[11px] text-muted">{job.schedule}</code>
            )}
          </div>
          <div className="mt-0.5 flex flex-wrap gap-x-3 gap-y-0.5 text-[11px] text-muted">
            <span>Last: {relativeTime(job.last_run)}</span>
            {job.next_run && <span>Next: {relativeTime(job.next_run)}</span>}
            {job.last_duration_ms != null && <span>{fmtMs(job.last_duration_ms)}</span>}
            {job.last_error && (
              <span
                className="text-[color:var(--danger)] truncate max-w-[180px]"
                title={job.last_error}
              >
                {job.last_error}
              </span>
            )}
          </div>
        </div>
        <span className={`shrink-0 text-[11px] font-medium ${statusCls}`}>{statusLabel}</span>
      </div>
    );
  };

  return (
    <DashboardSection title="Cron Jobs" action={headerAction}>
      {showLoading ? (
        <DashboardEmptyState message="Loading cron data…" />
      ) : jobCount === 0 ? (
        <DashboardEmptyState message="No cron jobs found on this gateway." />
      ) : (
        <div className="space-y-1.5">{cronJobs.map((job) => renderJobRow(job))}</div>
      )}
    </DashboardSection>
  );
}

View File

@ -0,0 +1,126 @@
"use client";
import type { SystemHealthResponse } from "@/api/generated/model";
import { DashboardSection } from "./DashboardSection";
import { DashboardEmptyState } from "./DashboardEmptyState";
/** Props accepted by GatewayHealthPanel. */
interface GatewayHealthPanelProps {
// Health payload to render; the panel reads its metrics from `health.current`. null means no data yet or the fetch produced none.
health: SystemHealthResponse | null;
// True while the health query is in flight; the component defaults it to false.
isLoading?: boolean;
// When false the panel renders nothing at all.
hasGateways: boolean;
}
/**
 * Render a horizontal usage bar with a numeric percentage next to it.
 *
 * The value is clamped to 0–100. The fill colour is "danger" above 90,
 * "warning" above 75 and "success" otherwise.
 *
 * @param pct Percentage to display, or null/undefined when unknown.
 * @returns The bar element, or null when no value is given.
 */
function pctBar(pct: number | null | undefined) {
  if (pct === null || pct === undefined) {
    return null;
  }

  const level = Math.min(100, Math.max(0, pct));

  let fillClass = "bg-[color:var(--success)]";
  if (level > 90) {
    fillClass = "bg-[color:var(--danger)]";
  } else if (level > 75) {
    fillClass = "bg-[color:var(--warning)]";
  }

  const barWidth = `${level}%`;
  const readout = level.toFixed(0);

  return (
    <div className="flex items-center gap-2">
      <div className="h-1.5 flex-1 overflow-hidden rounded-full bg-[color:var(--surface-strong)]">
        <div
          className={`h-full rounded-full transition-all ${fillClass}`}
          style={{ width: barWidth }}
        />
      </div>
      <span className="w-10 text-right text-xs tabular-nums text-muted">
        {readout}%
      </span>
    </div>
  );
}
/**
 * Format an uptime in seconds using its two most significant units.
 *
 * @param seconds Uptime in seconds, or null/undefined when unknown.
 * @returns "—" when no value is given; otherwise "<d>d <h>h" when at least a
 *          day, "<h>h <m>m" when at least an hour, and "<m>m" below that.
 */
function fmtUptime(seconds: number | null | undefined): string {
  if (seconds === null || seconds === undefined) {
    return "—";
  }

  const days = Math.floor(seconds / 86400);
  const secondsIntoDay = seconds % 86400;
  const hours = Math.floor(secondsIntoDay / 3600);
  const secondsIntoHour = seconds % 3600;
  const minutes = Math.floor(secondsIntoHour / 60);

  if (days > 0) {
    return `${days}d ${hours}h`;
  }
  return hours > 0 ? `${hours}h ${minutes}m` : `${minutes}m`;
}
/**
 * Format a byte count as a parenthesised suffix for a metric label.
 *
 * Uses decimal units: values of at least 1e9 become " (x.x GB)" and values of
 * at least 1e6 become " (x MB)". The leading space lets the result be appended
 * directly after a label such as "Memory".
 *
 * @param bytes Byte count, or null/undefined when unknown.
 * @returns The suffix, or "" when no value is given or it is below 1e6.
 */
function fmtBytes(bytes: number | null | undefined): string {
  if (bytes === null || bytes === undefined) {
    return "";
  }

  const GIGABYTE = 1e9;
  const MEGABYTE = 1e6;

  if (bytes >= GIGABYTE) {
    return ` (${(bytes / GIGABYTE).toFixed(1)} GB)`;
  }
  return bytes >= MEGABYTE ? ` (${(bytes / MEGABYTE).toFixed(0)} MB)` : "";
}
/**
 * Dashboard panel summarising a gateway's current system health.
 *
 * Renders nothing when no gateways are configured. Otherwise shows a loading
 * placeholder while the first fetch is in flight, an "unavailable" message
 * when the snapshot carries none of the displayable metrics, or CPU / memory /
 * disk usage bars followed by uptime, hostname and platform.
 *
 * A metric counts as present when it is neither null nor undefined, so a
 * legitimate reading of 0 (idle CPU, a freshly booted host) is still shown
 * instead of being mistaken for missing data.
 */
export function GatewayHealthPanel({
  health,
  isLoading = false,
  hasGateways,
}: GatewayHealthPanelProps) {
  if (!hasGateways) return null;

  const c = health?.current ?? null;
  // Nullish checks rather than truthiness: 0 is a valid reading. disk_pct is
  // included because the panel renders it, so a disk-only snapshot must not
  // be reported as "unavailable".
  const hasMetrics =
    c !== null &&
    (c.cpu_pct != null ||
      c.memory_pct != null ||
      c.disk_pct != null ||
      c.uptime_seconds != null);

  return (
    <DashboardSection title="Gateway Health">
      {isLoading && !health ? (
        <DashboardEmptyState message="Loading health data…" />
      ) : c === null || !hasMetrics ? (
        <DashboardEmptyState message="Health data unavailable. The gateway may not expose system metrics." />
      ) : (
        <div className="space-y-3">
          {/* CPU */}
          {c.cpu_pct != null && (
            <div>
              <p className="mb-1 text-xs font-medium text-muted">CPU</p>
              {pctBar(c.cpu_pct)}
              {c.load_avg_1m != null && (
                <p className="mt-0.5 text-[11px] text-muted">
                  load avg {c.load_avg_1m.toFixed(2)}
                  {c.load_avg_5m != null ? ` / ${c.load_avg_5m.toFixed(2)}` : ""}
                  {c.load_avg_15m != null ? ` / ${c.load_avg_15m.toFixed(2)}` : ""}
                </p>
              )}
            </div>
          )}
          {/* RAM */}
          {c.memory_pct != null && (
            <div>
              <p className="mb-1 text-xs font-medium text-muted">
                Memory{fmtBytes(c.memory_used_bytes)}
              </p>
              {pctBar(c.memory_pct)}
            </div>
          )}
          {/* Disk */}
          {c.disk_pct != null && (
            <div>
              <p className="mb-1 text-xs font-medium text-muted">
                Disk{fmtBytes(c.disk_used_bytes)}
              </p>
              {pctBar(c.disk_pct)}
            </div>
          )}
          {/* Uptime / hostname */}
          <div className="flex flex-wrap gap-4 border-t border-[color:var(--border)] pt-3 text-xs text-muted">
            {c.uptime_seconds != null && (
              <span>
                Uptime <span className="font-medium text-strong">{fmtUptime(c.uptime_seconds)}</span>
              </span>
            )}
            {c.hostname && (
              <span className="truncate">
                Host <span className="font-medium text-strong">{c.hostname}</span>
              </span>
            )}
            {c.platform && (
              <span>{c.platform}</span>
            )}
          </div>
        </div>
      )}
    </DashboardSection>
  );
}