Pipeline/backend/app/services/openclaw/runtime_activity.py

"""Runtime activity service — fetch and normalize recent gateway session messages.

Data source: gateway ``chat.history`` RPC (returns recent messages per session).
This is supplemental to the DB-backed activity feed; it shows what is happening
in active gateway sessions in near-real-time without requiring any writes.

Design notes
------------
- Polling: callers poll this service on an interval; it does not maintain state.
- Deduplication: based on ``(session_key, message_index)`` because chat.history
  does not return stable message IDs.
- Redaction: known-sensitive tool argument names are blanked; large content is
  truncated to a short preview.
- Authorization: callers must have already verified gateway ownership before
  passing a config here.
"""

from __future__ import annotations

import hashlib
import re
from datetime import datetime
from typing import Any

from app.core.logging import get_logger
from app.core.time import utcnow
from app.services.openclaw.gateway_rpc import (
    GatewayConfig,
    OpenClawGatewayError,
    get_chat_history,
    openclaw_call,
)

logger = get_logger(__name__)

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

CONTENT_PREVIEW_MAX = 300   # chars before truncation
HISTORY_FETCH_LIMIT = 20    # messages to fetch per session per poll
POLL_HISTORY_SESSIONS_MAX = 10  # max sessions to poll in one pass

# Argument names that should be fully redacted from tool call previews
_REDACT_TOOL_ARG_NAMES = frozenset(
    {
        "password", "passwd", "secret", "token", "api_key", "apikey",
        "access_key", "private_key", "credential", "credentials",
        "authorization", "bearer", "session_token", "refresh_token",
    }
)

# Tool names whose entire input should be summarised rather than previewed
_SUMMARISE_TOOLS = frozenset({"bash", "computer", "str_replace_editor"})

# ---------------------------------------------------------------------------
# Normalized event type
# ---------------------------------------------------------------------------

class RuntimeMessageEvent:
    """Normalized representation of one gateway session message."""

    __slots__ = (
        "event_id",
        "session_key",
        "session_label",
        "role",
        "model",
        "content_preview",
        "content_truncated",
        "has_tool_use",
        "tool_names",
        "timestamp",
        "agent_id",
        "board_id",
        "message_index",
    )

    def __init__(
        self,
        *,
        event_id: str,
        session_key: str,
        session_label: str | None,
        role: str,
        model: str | None,
        content_preview: str,
        content_truncated: bool,
        has_tool_use: bool,
        tool_names: list[str],
        timestamp: datetime | None,
        agent_id: str | None,
        board_id: str | None,
        message_index: int,
    ) -> None:
        self.event_id = event_id
        self.session_key = session_key
        self.session_label = session_label
        self.role = role
        self.model = model
        self.content_preview = content_preview
        self.content_truncated = content_truncated
        self.has_tool_use = has_tool_use
        self.tool_names = tool_names
        self.timestamp = timestamp
        self.agent_id = agent_id
        self.board_id = board_id
        self.message_index = message_index

    def to_dict(self) -> dict[str, Any]:
        return {
            "event_id": self.event_id,
            "session_key": self.session_key,
            "session_label": self.session_label,
            "role": self.role,
            "model": self.model,
            "content_preview": self.content_preview,
            "content_truncated": self.content_truncated,
            "has_tool_use": self.has_tool_use,
            "tool_names": self.tool_names,
            "timestamp": self.timestamp.isoformat() if self.timestamp else None,
            "agent_id": self.agent_id,
            "board_id": self.board_id,
            "message_index": self.message_index,
        }


# ---------------------------------------------------------------------------
# Content extraction and redaction
# ---------------------------------------------------------------------------

def _extract_text(content: object) -> tuple[str, bool]:
    """Return (preview_text, was_truncated) from a message content value."""
    if content is None:
        return "", False

    if isinstance(content, str):
        text = content
        truncated = len(text) > CONTENT_PREVIEW_MAX
        return text[:CONTENT_PREVIEW_MAX], truncated

    if isinstance(content, list):
        parts: list[str] = []
        for block in content:
            if not isinstance(block, dict):
                continue
            btype = block.get("type", "")
            if btype == "text":
                parts.append(str(block.get("text") or ""))
            elif btype == "tool_use":
                name = block.get("name", "tool")
                if name in _SUMMARISE_TOOLS:
                    parts.append(f"[tool: {name}]")
                else:
                    parts.append(f"[tool: {name}]")
            elif btype == "tool_result":
                result_content = block.get("content")
                if isinstance(result_content, str):
                    parts.append(f"[result: {result_content[:80]}]")
                else:
                    parts.append("[result]")
        combined = " ".join(p for p in parts if p)
        truncated = len(combined) > CONTENT_PREVIEW_MAX
        return combined[:CONTENT_PREVIEW_MAX], truncated

    return str(content)[:CONTENT_PREVIEW_MAX], False


def redact_tool_args(args: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of tool args with sensitive keys replaced by ``[REDACTED]``."""
    if not isinstance(args, dict):
        return {}
    result: dict[str, Any] = {}
    for key, value in args.items():
        if key.lower() in _REDACT_TOOL_ARG_NAMES:
            result[key] = "[REDACTED]"
        elif isinstance(value, str) and len(value) > 500:
            result[key] = value[:200] + "…[truncated]"
        else:
            result[key] = value
    return result


def _collect_tool_names(content: object) -> list[str]:
    """Return names of all tool_use blocks in a message content."""
    if not isinstance(content, list):
        return []
    return [
        str(block.get("name") or "unknown")
        for block in content
        if isinstance(block, dict) and block.get("type") == "tool_use"
    ]


def _parse_timestamp(msg: dict[str, Any]) -> datetime | None:
    for key in ("timestamp", "created_at", "createdAt", "time"):
        val = msg.get(key)
        if isinstance(val, str) and val.strip():
            try:
                normalized = val.strip().replace("Z", "+00:00")
                from datetime import timezone
                parsed = datetime.fromisoformat(normalized)
                if parsed.tzinfo is not None:
                    return parsed.astimezone(timezone.utc).replace(tzinfo=None)
                return parsed
            except ValueError:
                pass
    return None


# ---------------------------------------------------------------------------
# Session key → agent/board correlation
# ---------------------------------------------------------------------------

_LEAD_SESSION_RE = re.compile(
    r"^agent:lead-(?P<board_id>[0-9a-fA-F-]{36}):main$"
)
_AGENT_SESSION_RE = re.compile(
    r"^agent:(?P<agent_slug>[^:]+):(?:main|board-(?P<board_id>[0-9a-fA-F-]{36}))$"
)


def _correlate_session(session_key: str) -> tuple[str | None, str | None]:
    """Return (agent_slug_or_none, board_id_or_none) inferred from the session key."""
    lead_m = _LEAD_SESSION_RE.match(session_key)
    if lead_m:
        return None, lead_m.group("board_id")
    agent_m = _AGENT_SESSION_RE.match(session_key)
    if agent_m:
        return agent_m.group("agent_slug"), agent_m.group("board_id")
    return None, None


# ---------------------------------------------------------------------------
# Message normaliser
# ---------------------------------------------------------------------------

def normalize_message(
    session_key: str,
    session_label: str | None,
    msg: dict[str, Any],
    index: int,
) -> RuntimeMessageEvent:
    """Convert one raw chat history message into a RuntimeMessageEvent."""
    role = str(msg.get("role") or "unknown")
    model = msg.get("model") or None
    if model:
        model = str(model)
    content = msg.get("content")
    preview, truncated = _extract_text(content)
    tool_names = _collect_tool_names(content)
    ts = _parse_timestamp(msg)
    agent_id, board_id = _correlate_session(session_key)

    # Stable deduplication key
    event_id = hashlib.sha256(
        f"{session_key}:{index}:{role}:{preview[:50]}".encode()
    ).hexdigest()[:16]

    return RuntimeMessageEvent(
        event_id=event_id,
        session_key=session_key,
        session_label=session_label,
        role=role,
        model=model,
        content_preview=preview,
        content_truncated=truncated,
        has_tool_use=bool(tool_names),
        tool_names=tool_names,
        timestamp=ts,
        agent_id=agent_id,
        board_id=board_id,
        message_index=index,
    )


# ---------------------------------------------------------------------------
# Gateway data fetching
# ---------------------------------------------------------------------------

async def _safe_chat_history(
    session_key: str,
    config: GatewayConfig,
    limit: int = HISTORY_FETCH_LIMIT,
) -> list[dict[str, Any]]:
    """Fetch chat history for one session; return [] on any error."""
    try:
        raw = await get_chat_history(session_key, config, limit=limit)
        if isinstance(raw, dict):
            messages = raw.get("messages") or raw.get("history") or []
        elif isinstance(raw, list):
            messages = raw
        else:
            return []
        return [m for m in messages if isinstance(m, dict)]
    except (OpenClawGatewayError, TimeoutError, OSError, RuntimeError) as exc:
        logger.debug(
            "runtime_activity.history_fetch_failed session_key=%s error=%s",
            session_key,
            exc,
        )
        return []


async def _list_active_sessions(config: GatewayConfig) -> list[dict[str, Any]]:
    """Return list of active session dicts from the gateway."""
    try:
        raw = await openclaw_call("sessions.list", {"limit": 50}, config=config)
        if isinstance(raw, dict):
            return [s for s in (raw.get("sessions") or []) if isinstance(s, dict)]
        if isinstance(raw, list):
            return [s for s in raw if isinstance(s, dict)]
    except (OpenClawGatewayError, TimeoutError, OSError, RuntimeError) as exc:
        logger.debug("runtime_activity.sessions_list_failed error=%s", exc)
    return []


# ---------------------------------------------------------------------------
# Main poll function
# ---------------------------------------------------------------------------

async def fetch_recent_events(
    config: GatewayConfig,
    *,
    since_ids: set[str] | None = None,
    max_sessions: int = POLL_HISTORY_SESSIONS_MAX,
    history_limit: int = HISTORY_FETCH_LIMIT,
) -> list[RuntimeMessageEvent]:
    """Fetch and normalize recent messages across all active sessions.

    Args:
        config: Gateway credentials/URL.
        since_ids: Set of event_ids already seen; new events not in this set
            are returned.  Pass ``None`` for the initial load (returns all).
        max_sessions: Cap on how many sessions to query per call.
        history_limit: Number of messages to fetch per session.

    Returns:
        List of new RuntimeMessageEvent objects, oldest-first.
    """
    sessions = await _list_active_sessions(config)
    sessions = sessions[:max_sessions]

    events: list[RuntimeMessageEvent] = []
    for session in sessions:
        key = session.get("key") or session.get("id")
        if not isinstance(key, str) or not key.strip():
            continue
        label = session.get("label") or session.get("name") or None

        messages = await _safe_chat_history(key, config, limit=history_limit)
        for idx, msg in enumerate(messages):
            event = normalize_message(key, label, msg, idx)
            if since_ids is None or event.event_id not in since_ids:
                events.append(event)

    # Sort by timestamp (nulls last), then by session_key + index for stability
    def sort_key(e: RuntimeMessageEvent) -> tuple:
        return (
            e.timestamp or datetime.min,
            e.session_key,
            e.message_index,
        )

    events.sort(key=sort_key)
    return events