Pipeline/backend/app/services/openclaw/runtime_usage.py

"""Runtime usage service — compute model spend, burn rate, and time remaining.

Data source: gateway RPC methods ``usage.cost`` and ``usage.status``.
Pricing: built-in defaults plus an optional JSON override file pointed to by
the ``RUNTIME_USAGE_PRICING_FILE`` environment variable.

Design decisions:
- Total tokens (input + output) are the basis for time-remaining predictions.
- All org members may call this endpoint (not admin-only).
- Parsing is fully defensive: malformed or missing fields default to zero.
- Unknown paid models are flagged ``unpriced=True`` so the UI can warn.
- Ollama / local models are flagged ``unpriced=False, cost_usd=0`` (free by design).
"""

from __future__ import annotations

import asyncio
import json
import os
import re
from datetime import datetime, timedelta, timezone
from typing import Any
from uuid import UUID

from app.core.logging import get_logger
from app.core.time import utcnow
from app.schemas.runtime_usage import (
    ModelUsageEntry,
    RuntimeUsageBurnRate,
    RuntimeUsageCurrent,
    RuntimeUsagePredictions,
    RuntimeUsageResponse,
    RuntimeUsageWindow,
    TopSession,
)
from app.services.openclaw.gateway_rpc import GatewayConfig as GatewayClientConfig
from app.services.openclaw.gateway_rpc import OpenClawGatewayError, openclaw_call

logger = get_logger(__name__)

# ---------------------------------------------------------------------------
# Pricing config (USD per million tokens)
# ---------------------------------------------------------------------------

_PricingEntry = dict[str, float]  # keys: input, output, cache_read, cache_write

DEFAULT_MODEL_PRICING: dict[str, _PricingEntry] = {
    # Anthropic — Claude 4.x
    # Opus cache_write = $18.75/MTok (5x input price, per Anthropic docs)
    "anthropic/claude-opus-4-7":    {"input": 15.00, "output": 75.00, "cache_read": 1.50,  "cache_write": 18.75},
    "anthropic/claude-opus-4-5":    {"input": 15.00, "output": 75.00, "cache_read": 1.50,  "cache_write": 18.75},
    "anthropic/claude-sonnet-4-6":  {"input":  3.00, "output": 15.00, "cache_read": 0.30,  "cache_write":  3.75},
    "anthropic/claude-sonnet-4-5":  {"input":  3.00, "output": 15.00, "cache_read": 0.30,  "cache_write":  3.75},
    "anthropic/claude-haiku-4-5":   {"input":  0.80, "output":  4.00, "cache_read": 0.08,  "cache_write":  1.00},
    # Anthropic — Claude 3.x
    "anthropic/claude-3-5-sonnet":  {"input":  3.00, "output": 15.00, "cache_read": 0.30,  "cache_write":  3.75},
    "anthropic/claude-3-5-haiku":   {"input":  0.80, "output":  4.00, "cache_read": 0.08,  "cache_write":  1.00},
    "anthropic/claude-3-opus":      {"input": 15.00, "output": 75.00, "cache_read": 1.50,  "cache_write": 18.75},
    "anthropic/claude-3-sonnet":    {"input":  3.00, "output": 15.00, "cache_read": 0.30,  "cache_write":  3.75},
    "anthropic/claude-3-haiku":     {"input":  0.25, "output":  1.25, "cache_read": 0.03,  "cache_write":  0.30},
    # OpenAI — GPT-4.1 family (2025)
    "openai/gpt-4.1":               {"input":  2.00, "output":  8.00, "cache_read": 0.50,  "cache_write": 0.00},
    "openai/gpt-4.1-mini":          {"input":  0.40, "output":  1.60, "cache_read": 0.10,  "cache_write": 0.00},
    "openai/gpt-4.1-nano":          {"input":  0.10, "output":  0.40, "cache_read": 0.025, "cache_write": 0.00},
    # OpenAI — GPT-4o family
    "openai/gpt-4o":                {"input":  2.50, "output": 10.00, "cache_read": 1.25,  "cache_write": 0.00},
    "openai/gpt-4o-mini":           {"input":  0.15, "output":  0.60, "cache_read": 0.075, "cache_write": 0.00},
    "openai/gpt-4-5":               {"input":  75.0, "output": 150.0, "cache_read": 37.50, "cache_write": 0.00},
    "openai/gpt-4-turbo":           {"input": 10.00, "output": 30.00, "cache_read": 0.00,  "cache_write": 0.00},
    "openai/gpt-4":                 {"input": 30.00, "output": 60.00, "cache_read": 0.00,  "cache_write": 0.00},
    "openai/gpt-3-5-turbo":         {"input":  0.50, "output":  1.50, "cache_read": 0.00,  "cache_write": 0.00},
    # OpenAI — o-series reasoning
    "openai/o1":                    {"input": 15.00, "output": 60.00, "cache_read": 7.50,  "cache_write": 0.00},
    "openai/o1-mini":               {"input":  3.00, "output": 12.00, "cache_read": 1.50,  "cache_write": 0.00},
    "openai/o3":                    {"input": 10.00, "output": 40.00, "cache_read": 2.50,  "cache_write": 0.00},
    "openai/o3-mini":               {"input":  1.10, "output":  4.40, "cache_read": 0.55,  "cache_write": 0.00},
    "openai/o4-mini":               {"input":  1.10, "output":  4.40, "cache_read": 0.275, "cache_write": 0.00},
    # Codex alias (free under Codex plan)
    "openai/codex":                 {"input":  0.00, "output":  0.00, "cache_read": 0.00,  "cache_write": 0.00},
}

_pricing_cache: dict[str, _PricingEntry] | None = None


def load_pricing() -> dict[str, _PricingEntry]:
    """Return merged pricing: defaults + optional override file.

    Override file may use either shape:
      { "provider/model": { "input": X, "output": Y, ... } }
      { "rates_usd_per_million": { "provider/model": { ... } } }
    """
    global _pricing_cache
    if _pricing_cache is not None:
        return _pricing_cache
    merged = dict(DEFAULT_MODEL_PRICING)
    override_path = os.getenv("RUNTIME_USAGE_PRICING_FILE", "").strip()
    if override_path:
        try:
            with open(override_path) as fh:
                raw = json.load(fh)
            if isinstance(raw, dict):
                # Unwrap reference-dashboard shape if present
                overrides: dict[str, Any] = raw.get("rates_usd_per_million", raw)
                if isinstance(overrides, dict):
                    merged.update(overrides)
                    logger.info("runtime_usage.pricing.override_loaded path=%s count=%d", override_path, len(overrides))
        except Exception as exc:
            logger.warning("runtime_usage.pricing.override_failed path=%s error=%s", override_path, exc)
    _pricing_cache = merged
    return _pricing_cache


# ---------------------------------------------------------------------------
# Provider / model normalisation
# ---------------------------------------------------------------------------

_PROVIDER_ALIASES: dict[str, str] = {
    "anthropic": "anthropic",
    "claude": "anthropic",
    "openai": "openai",
    "codex": "openai",
    "ollama": "ollama",
    "local": "ollama",
    "gemini": "google",
    "google": "google",
}

_MODEL_STRIP_RE = re.compile(
    r"(-\d{8}|-latest|-preview|-instruct|-chat|-v\d+(\.\d+)*)$",
    re.IGNORECASE,
)


def normalize_provider(raw: str) -> str:
    """Normalise a provider string to a canonical lower-case slug."""
    cleaned = raw.strip().lower()
    return _PROVIDER_ALIASES.get(cleaned, cleaned or "unknown")


def normalize_model(raw: str) -> str:
    """Strip date stamps, version tags, and known suffixes from a model name."""
    cleaned = raw.strip().lower()
    # Remove provider prefix if present (e.g. "anthropic/claude-sonnet-4-5")
    if "/" in cleaned:
        cleaned = cleaned.split("/", 1)[1]
    # Strip trailing date/version stamps
    cleaned = _MODEL_STRIP_RE.sub("", cleaned)
    return cleaned or raw.strip().lower()


def model_key(provider: str, model: str) -> str:
    """Return the canonical dict key ``"provider/model"``."""
    return f"{provider}/{model}"


# ---------------------------------------------------------------------------
# Cost estimation
# ---------------------------------------------------------------------------

def estimate_cost(
    provider: str,
    model: str,
    input_tokens: int,
    output_tokens: int,
    cache_read_tokens: int = 0,
    cache_write_tokens: int = 0,
) -> tuple[float, bool]:
    """Return ``(cost_usd, unpriced)`` for the given token counts.

    ``unpriced=True`` means we have no price for this paid model — the caller
    should surface a warning.  Ollama / local models return ``(0.0, False)``.
    """
    pricing = load_pricing()
    key = model_key(provider, model)
    entry = pricing.get(key)

    if entry is None:
        if provider == "ollama":
            return 0.0, False
        return 0.0, True  # unknown paid model

    per_m = 1_000_000
    cost = (
        input_tokens       * entry.get("input",       0.0) / per_m
        + output_tokens      * entry.get("output",      0.0) / per_m
        + cache_read_tokens  * entry.get("cache_read",  0.0) / per_m
        + cache_write_tokens * entry.get("cache_write", 0.0) / per_m
    )
    return round(cost, 8), False


# ---------------------------------------------------------------------------
# Gateway RPC helpers
# ---------------------------------------------------------------------------

async def _safe_call(
    method: str,
    config: GatewayClientConfig,
) -> dict[str, Any]:
    """Call a gateway method and return a dict, or an empty dict on any error."""
    try:
        result = await openclaw_call(method, config=config)
        if isinstance(result, dict):
            return result
        logger.debug("runtime_usage.rpc.unexpected_type method=%s type=%s", method, type(result).__name__)
        return {}
    except OpenClawGatewayError as exc:
        logger.debug("runtime_usage.rpc.gateway_error method=%s error=%s", method, exc)
        return {}
    except Exception as exc:
        logger.warning("runtime_usage.rpc.error method=%s error=%s", method, exc)
        return {}


def _get_float(d: dict[str, Any], *keys: str, default: float = 0.0) -> float:
    for key in keys:
        val = d.get(key)
        if val is not None:
            try:
                return float(val)
            except (TypeError, ValueError):
                pass
    return default


def _get_int(d: dict[str, Any], *keys: str, default: int = 0) -> int:
    return int(_get_float(d, *keys, default=float(default)))


def _get_str(d: dict[str, Any], *keys: str, default: str = "") -> str:
    for key in keys:
        val = d.get(key)
        if isinstance(val, str) and val.strip():
            return val.strip()
    return default


def _get_explicit_cost(session: dict[str, Any]) -> float:
    """Return the explicit provider/runtime cost for a session, or 0.0 if absent.

    Priority (reference dashboard order):
      1. session["usage"]["cost"]["total"]  — explicit cost from provider/runtime
      2. session["usage"]["cost"]           — flat cost in usage block
      3. session["cost"] / session["cost_usd"] / session["costUsd"]

    A value of 0.0 means "not present or zero" — callers should fall back to a
    local price-table estimate in that case.  Never overwrite a positive explicit
    cost with a local estimate.
    """
    usage = session.get("usage")
    if isinstance(usage, dict):
        cost_block = usage.get("cost")
        if isinstance(cost_block, dict):
            total = cost_block.get("total")
            if isinstance(total, (int, float)) and total > 0:
                return float(total)
        if isinstance(cost_block, (int, float)) and cost_block > 0:
            return float(cost_block)
    return _get_float(session, "cost", "cost_usd", "costUsd", default=0.0)


def _parse_datetime(value: object) -> datetime | None:
    if not isinstance(value, str) or not value.strip():
        return None
    normalized = value.strip().replace("Z", "+00:00")
    try:
        parsed = datetime.fromisoformat(normalized)
        if parsed.tzinfo is not None:
            return parsed.astimezone(timezone.utc).replace(tzinfo=None)
        return parsed
    except ValueError:
        return None


def _parse_rate_limit_reset_value(value: object, now: datetime) -> datetime | None:
    """Parse rate-limit reset values from headers into a UTC naive datetime.

    Supports:
    - RFC3339/ISO timestamps
    - Unix epoch seconds / milliseconds
    - Delta-seconds strings
    """
    parsed = _parse_datetime(value)
    if parsed is not None:
        return parsed

    if value is None:
        return None

    raw = str(value).strip()
    if not raw:
        return None

    try:
        numeric = float(raw)
    except ValueError:
        return None

    # Very large values are likely epoch milliseconds.
    if numeric >= 1_000_000_000_000:
        return datetime.fromtimestamp(numeric / 1000, tz=timezone.utc).replace(tzinfo=None)
    # Epoch seconds (typical range today ~1.7e9).
    if numeric >= 1_000_000_000:
        return datetime.fromtimestamp(numeric, tz=timezone.utc).replace(tzinfo=None)
    # Otherwise treat as delta-seconds until reset.
    if numeric >= 0:
        return now + timedelta(seconds=numeric)
    return None


def _extract_rate_limit_reset_at(status_raw: dict[str, Any], now: datetime) -> datetime | None:
    """Find the first parseable rate-limit reset timestamp in status data."""
    explicit_keys = (
        "x_ratelimit_reset",
        "x_ratelimit_reset_tokens",
        "x_ratelimit_reset_requests",
        "ratelimit_reset",
        "anthropic_ratelimit_reset",
        "anthropic_ratelimit_tokens_reset",
        "anthropic_ratelimit_requests_reset",
        "anthropic_ratelimit_input_tokens_reset",
    )
    for key in explicit_keys:
        if key in status_raw:
            parsed = _parse_rate_limit_reset_value(status_raw.get(key), now)
            if parsed is not None:
                return parsed

    # Defensive fallback: inspect any ratelimit*reset field that may appear.
    for key, value in status_raw.items():
        normalized = str(key).lower().replace("-", "_")
        if "ratelimit" in normalized and "reset" in normalized:
            parsed = _parse_rate_limit_reset_value(value, now)
            if parsed is not None:
                return parsed
    return None


# ---------------------------------------------------------------------------
# Session parsing
# ---------------------------------------------------------------------------

def _parse_sessions(cost_raw: dict[str, Any]) -> list[dict[str, Any]]:
    """Extract a flat list of session dicts from the usage.cost response.

    The gateway may return sessions at the top level or nested under a window
    key (``"5hour"``, ``"today"``, ``"week"``, etc.).  We prefer the most
    granular available list.
    """
    # Try flat list first
    flat = cost_raw.get("sessions")
    if isinstance(flat, list):
        return [s for s in flat if isinstance(s, dict)]

    # Try nested under window keys, in preference order
    for window_key in ("5hour", "5h", "today", "week", "7day", "data"):
        bucket = cost_raw.get(window_key)
        if isinstance(bucket, dict):
            nested = bucket.get("sessions")
            if isinstance(nested, list):
                return [s for s in nested if isinstance(s, dict)]
        if isinstance(bucket, list):
            return [s for s in bucket if isinstance(s, dict)]

    return []


def _parse_session_usage(session: dict[str, Any]) -> dict[str, int]:
    """Extract token counts from a session dict, trying multiple key conventions."""
    usage = session.get("usage") or session.get("tokens") or {}
    if not isinstance(usage, dict):
        usage = {}

    return {
        "input":       _get_int(usage, "input_tokens",        "inputTokens",        "input",        default=0)
                    or _get_int(session, "input_tokens",       "inputTokens",        default=0),
        "output":      _get_int(usage, "output_tokens",       "outputTokens",       "output",       default=0)
                    or _get_int(session, "output_tokens",      "outputTokens",       default=0),
        "cache_read":  _get_int(usage, "cache_read_input_tokens", "cacheReadTokens",  "cache_read",  default=0)
                    or _get_int(session, "cache_read_tokens",  "cacheReadTokens",    default=0),
        "cache_write": _get_int(usage, "cache_creation_input_tokens", "cacheWriteTokens", "cache_write", default=0)
                    or _get_int(session, "cache_write_tokens", "cacheWriteTokens",   default=0),
    }


# ---------------------------------------------------------------------------
# Aggregation
# ---------------------------------------------------------------------------

def aggregate_per_model(
    sessions: list[dict[str, Any]],
    account_key: str = "default",
) -> dict[str, ModelUsageEntry]:
    """Roll up token counts and cost across sessions, keyed by provider/model."""
    entries: dict[str, dict[str, Any]] = {}

    for session in sessions:
        raw_provider = _get_str(session, "provider", default="anthropic")
        raw_model    = _get_str(session, "model", "modelName", default="unknown")
        provider     = normalize_provider(raw_provider)
        model        = normalize_model(raw_model)
        key          = model_key(provider, model)

        tokens = _parse_session_usage(session)
        session_cost = _get_explicit_cost(session)
        calls        = _get_int(session, "calls", "messageCount", "messages", default=1)
        # Only estimate when the gateway provided no explicit cost
        if session_cost == 0.0:
            session_cost, _ = estimate_cost(
                provider, model,
                tokens["input"], tokens["output"],
                tokens["cache_read"], tokens["cache_write"],
            )
        _, unpriced = estimate_cost(provider, model, 0, 0)

        if key not in entries:
            entries[key] = {
                "provider": provider,
                "account_key": account_key,
                "model": model,
                "input_tokens": 0,
                "output_tokens": 0,
                "cache_read_tokens": 0,
                "cache_write_tokens": 0,
                "cost_usd": 0.0,
                "calls": 0,
                "unpriced": unpriced,
                "source": "local_jsonl_estimate",  # default source for aggregated session data
            }
        e = entries[key]
        e["input_tokens"]       += tokens["input"]
        e["output_tokens"]      += tokens["output"]
        e["cache_read_tokens"]  += tokens["cache_read"]
        e["cache_write_tokens"] += tokens["cache_write"]
        e["cost_usd"]           += session_cost
        e["calls"]              += calls

    return {
        key: ModelUsageEntry(
            **{**e, "total_tokens": e["input_tokens"] + e["output_tokens"]},
        )
        for key, e in entries.items()
    }


def _top_sessions(
    sessions: list[dict[str, Any]],
    limit: int = 10,
) -> list[TopSession]:
    rows = []
    for session in sessions:
        sid   = _get_str(session, "sessionId", "id", "session_id", default="")
        label = _get_str(session, "label", "name", "title") or None
        model = _get_str(session, "model", "modelName") or None
        if model:
            provider = normalize_provider(_get_str(session, "provider", default="anthropic"))
            model    = model_key(provider, normalize_model(model))
        tokens = _parse_session_usage(session)
        total  = tokens["input"] + tokens["output"]
        cost   = _get_explicit_cost(session)
        if cost == 0.0 and model:
            parts = model.split("/", 1)
            if len(parts) == 2:
                cost, _ = estimate_cost(
                    parts[0], parts[1],
                    tokens["input"], tokens["output"],
                    tokens["cache_read"], tokens["cache_write"],
                )
        updated = _get_str(session, "updated_at", "updatedAt", "lastActivity", "last_activity") or None
        rows.append(TopSession(
            session_id=sid,
            label=label,
            model=model,
            cost_usd=round(cost, 8),
            total_tokens=total,
            updated_at=updated,
            source="local_jsonl_estimate",  # default source for session data
        ))
    rows.sort(key=lambda r: r.cost_usd, reverse=True)
    return rows[:limit]


# ---------------------------------------------------------------------------
# Window and limit helpers
# ---------------------------------------------------------------------------

_WINDOW_HOURS = 5


def _oldest_active_ts(
    sessions: list[dict[str, Any]],
    now: datetime,
) -> datetime | None:
    """Return the oldest session timestamp still inside the rolling window, or None."""
    cutoff = now - timedelta(hours=_WINDOW_HOURS)
    oldest: datetime | None = None
    for session in sessions:
        raw_ts = _get_str(
            session,
            "updated_at", "updatedAt", "lastActivity", "last_activity",
            "created_at", "createdAt",
        )
        ts = _parse_datetime(raw_ts)
        if ts is None or ts < cutoff:
            continue
        if oldest is None or ts < oldest:
            oldest = ts
    return oldest


def _build_window(
    status_raw: dict[str, Any],
    now: datetime,
    account_key: str = "default",
    oldest_event_ts: datetime | None = None,
) -> RuntimeUsageWindow:
    """Build the usage window, preferring gateway status data then falling back.

    Source assignment:
    - If gateway status provides explicit window data, use provider_native
    - If API rate-limit headers are the only source, use provider_api_rate_limit
    - If falling back to local logic, use local_jsonl_estimate

    When falling back, `oldest_event_ts` anchors the window start to the
    oldest active session timestamp (oldest_event_ts + 5h = reset).  This
    avoids the previous bug where started_at = now - 5h made reset_in_ms = 0.
    """
    explicit_started_at = _parse_datetime(
        status_raw.get("windowStart") or status_raw.get("window_start")
        or status_raw.get("period_start") or status_raw.get("started_at")
    )
    explicit_resets_at = _parse_datetime(
        status_raw.get("windowEnd") or status_raw.get("window_end")
        or status_raw.get("period_end") or status_raw.get("resets_at")
    )
    header_resets_at = _extract_rate_limit_reset_at(status_raw, now)

    started_at = explicit_started_at
    if started_at is None:
        # Anchor to oldest active session so reset_in_ms reflects real remaining time.
        # If no sessions exist in the window, keep a conservative local fallback.
        started_at = oldest_event_ts if oldest_event_ts is not None else now - timedelta(hours=_WINDOW_HOURS)

    resets_at = explicit_resets_at or header_resets_at
    if resets_at is None:
        resets_at = started_at + timedelta(hours=_WINDOW_HOURS)

    # Label by reset-time provenance. Local fallback reset is always an estimate.
    if explicit_resets_at is not None:
        source = "provider_native"
        confidence = "high"
    elif header_resets_at is not None:
        source = "provider_api_rate_limit"
        confidence = "medium"
    else:
        source = "local_jsonl_estimate"
        confidence = "low"

    reset_delta = resets_at - now
    reset_in_ms = max(0, int(reset_delta.total_seconds() * 1000))
    return RuntimeUsageWindow(
        key=f"{_WINDOW_HOURS}h",
        started_at=started_at,
        resets_at=resets_at,
        reset_in_ms=reset_in_ms,
        source=source,
        confidence=confidence,
    )


def _limit_source(status_raw: dict[str, Any]) -> str:
    """Return the appropriate source label for a limit read from gateway status."""
    has_rate_limit_headers = (
        status_raw.get("x_ratelimit_remaining") or
        status_raw.get("x_ratelimit_limit") or
        status_raw.get("anthropic_ratelimit_remaining") or
        status_raw.get("anthropic_ratelimit_limit")
    )
    return "provider_api_rate_limit" if has_rate_limit_headers else "configured_limit"


def _pct(numerator: int | float, denominator: int | float) -> int | None:
    if not denominator:
        return None
    return int(min(100, numerator * 100 // denominator))


def _build_current(
    per_model: dict[str, ModelUsageEntry],
    status_raw: dict[str, Any],
    account_key: str = "default",
) -> RuntimeUsageCurrent:
    total_cost         = round(sum(e.cost_usd for e in per_model.values()), 8)
    total_tokens       = sum(e.total_tokens for e in per_model.values())
    total_output_tokens = sum(e.output_tokens for e in per_model.values())
    total_calls        = sum(e.calls for e in per_model.values())

    src = _limit_source(status_raw)

    # ── Explicit output-token limit ───────────────────────────────────────────
    raw_output_limit = _get_int(
        status_raw, "outputTokenLimit", "output_token_limit", default=0
    )
    output_token_limit     = raw_output_limit or None
    output_token_limit_pct = _pct(total_output_tokens, raw_output_limit)
    output_token_limit_src = src if raw_output_limit else None

    # ── Explicit total-token limit ────────────────────────────────────────────
    raw_total_limit = _get_int(
        status_raw, "totalTokenLimit", "total_token_limit", default=0
    )
    total_token_limit     = raw_total_limit or None
    total_token_limit_pct = _pct(total_tokens, raw_total_limit)
    total_token_limit_src = src if raw_total_limit else None

    # ── Message/request limit (count-based, never token-based) ───────────────
    raw_message_limit = _get_int(
        status_raw, "messageLimit", "message_limit", "requestLimit", "request_limit",
        default=0,
    )
    message_limit     = raw_message_limit or None
    message_pct       = _pct(total_calls, raw_message_limit)
    message_limit_src = src if raw_message_limit else None

    # ── Legacy token_limit (ambiguous kind — maps to tokenLimit only) ─────────
    # Do NOT fold messageLimit into this; keep units separate.
    raw_token_limit = _get_int(status_raw, "tokenLimit", "token_limit", default=0)
    token_limit     = raw_token_limit or None
    token_pct       = _pct(total_tokens, raw_token_limit)
    token_limit_src = src if raw_token_limit else None

    # If we got an explicit typed limit but no legacy one, backfill legacy
    # so existing dashboard code still works during the transition.
    if token_limit is None:
        if output_token_limit is not None:
            token_limit     = output_token_limit
            token_pct       = output_token_limit_pct
            token_limit_src = output_token_limit_src
        elif total_token_limit is not None:
            token_limit     = total_token_limit
            token_pct       = total_token_limit_pct
            token_limit_src = total_token_limit_src

    # ── Cost limit ────────────────────────────────────────────────────────────
    raw_cost_limit = _get_float(status_raw, "costLimit", "cost_limit", "costLimitUsd", default=0.0)
    cost_limit     = raw_cost_limit or None
    cost_pct       = _pct(total_cost, raw_cost_limit) if raw_cost_limit else None
    cost_limit_src = src if raw_cost_limit else None

    return RuntimeUsageCurrent(
        total_cost_usd=total_cost,
        total_tokens=total_tokens,
        total_output_tokens=total_output_tokens,
        total_calls=total_calls,
        # legacy
        token_limit=token_limit,
        token_pct=token_pct,
        cost_limit_usd=cost_limit,
        cost_pct=cost_pct,
        token_limit_source=token_limit_src,
        cost_limit_source=cost_limit_src,
        # typed
        output_token_limit=output_token_limit,
        output_token_limit_pct=output_token_limit_pct,
        output_token_limit_source=output_token_limit_src,
        total_token_limit=total_token_limit,
        total_token_limit_pct=total_token_limit_pct,
        total_token_limit_source=total_token_limit_src,
        message_limit=message_limit,
        message_pct=message_pct,
        message_limit_source=message_limit_src,
    )


def _compute_burn_rate(
    sessions: list[dict[str, Any]],
    window: RuntimeUsageWindow,
    now: datetime,
) -> RuntimeUsageBurnRate:
    """Compute tokens/min and cost/min from the most recent 60 minutes of sessions.

    Tracks total tokens (input+output) and output tokens separately so that
    predictions against output-token limits use the correct numerator.
    """
    cutoff = now - timedelta(minutes=60)
    recent_tokens = 0
    recent_output_tokens = 0
    recent_cost   = 0.0

    for session in sessions:
        raw_ts = _get_str(session, "updated_at", "updatedAt", "lastActivity", "last_activity")
        ts = _parse_datetime(raw_ts)
        if ts is None or ts < cutoff:
            continue
        tokens = _parse_session_usage(session)
        recent_tokens        += tokens["input"] + tokens["output"]
        recent_output_tokens += tokens["output"]
        recent_cost          += _get_float(session, "cost", "cost_usd", "costUsd", default=0.0)

    return RuntimeUsageBurnRate(
        tokens_per_minute=round(recent_tokens / 60, 4),
        output_tokens_per_minute=round(recent_output_tokens / 60, 4),
        cost_usd_per_minute=round(recent_cost / 60, 8),
    )


def _build_predictions(
    current: RuntimeUsageCurrent,
    burn_rate: RuntimeUsageBurnRate,
    window: RuntimeUsageWindow,
) -> RuntimeUsagePredictions:
    """Estimate time-to-limit in ms using the most constrained matching limit.

    Priority order (tightest first):
      1. output_token_limit  vs output_tokens   (burn: output_tokens_per_minute)
      2. total_token_limit   vs total_tokens    (burn: tokens_per_minute)
      3. legacy token_limit  vs total_tokens    (burn: tokens_per_minute)
      4. message_limit       vs total_calls     (constant rate = calls / window_minutes)

    Cost and request limits are not used for time-to-limit since they either
    require billing data (cost) or are not the binding constraint in practice.
    """
    candidates: list[tuple[int, str]] = []  # (time_to_limit_ms, kind)

    # ── Output-token limit ────────────────────────────────────────────────────
    if (
        current.output_token_limit is not None
        and burn_rate.output_tokens_per_minute > 0
    ):
        remaining = current.output_token_limit - current.total_output_tokens
        if remaining <= 0:
            return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False, limit_kind="output_tokens")
        candidates.append((
            int(remaining / burn_rate.output_tokens_per_minute * 60_000),
            "output_tokens",
        ))

    # ── Total-token limit ─────────────────────────────────────────────────────
    if (
        current.total_token_limit is not None
        and burn_rate.tokens_per_minute > 0
    ):
        remaining = current.total_token_limit - current.total_tokens
        if remaining <= 0:
            return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False, limit_kind="total_tokens")
        candidates.append((
            int(remaining / burn_rate.tokens_per_minute * 60_000),
            "total_tokens",
        ))

    # ── Legacy token_limit (only when no typed token limit) ───────────────────
    if (
        not candidates
        and current.token_limit is not None
        and burn_rate.tokens_per_minute > 0
    ):
        remaining = current.token_limit - current.total_tokens
        if remaining <= 0:
            return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False, limit_kind="total_tokens")
        candidates.append((
            int(remaining / burn_rate.tokens_per_minute * 60_000),
            "total_tokens",
        ))

    # ── Message limit ─────────────────────────────────────────────────────────
    if current.message_limit is not None and current.message_limit > 0:
        # Use elapsed time (window duration - remaining) so rate reflects
        # actual usage density, not just time left in the window.
        total_window_ms = max(1, int((window.resets_at - window.started_at).total_seconds() * 1000))
        elapsed_ms = max(1, total_window_ms - window.reset_in_ms)
        elapsed_minutes = elapsed_ms / 60_000
        calls_per_minute = current.total_calls / elapsed_minutes if elapsed_minutes > 0 else 0
        if calls_per_minute > 0:
            remaining = current.message_limit - current.total_calls
            if remaining <= 0:
                return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False, limit_kind="messages")
            candidates.append((
                int(remaining / calls_per_minute * 60_000),
                "messages",
            ))

    if not candidates:
        return RuntimeUsagePredictions(time_to_limit_ms=None, safe=True, limit_kind="total_tokens")

    # Pick the most constrained (smallest time) — that is what will actually block work.
    time_to_limit_ms, kind = min(candidates, key=lambda c: c[0])
    safe = time_to_limit_ms > window.reset_in_ms
    return RuntimeUsagePredictions(time_to_limit_ms=time_to_limit_ms, safe=safe, limit_kind=kind)


# ---------------------------------------------------------------------------
# Public service entry point
# ---------------------------------------------------------------------------

async def get_runtime_usage(
    gateway_id: UUID,
    config: GatewayClientConfig,
    account_key: str = "default",
) -> RuntimeUsageResponse:
    """Fetch and aggregate runtime usage for one gateway.

    Args:
        gateway_id: Pipeline gateway DB id (echoed in response).
        config: Credentials and URL for the gateway RPC connection.
        account_key: Stable identifier for this gateway's account
            (e.g. ``"claude-default"``, ``"openai-work"``).  Used in
            per-model breakdowns to keep separate accounts distinct.

    Returns:
        A fully populated ``RuntimeUsageResponse``.  All fields default to
        safe zeroes if the gateway is unreachable or returns unexpected data.
    """
    now = utcnow()

    cost_raw, status_raw = await asyncio.gather(
        _safe_call("usage.cost",   config),
        _safe_call("usage.status", config),
    )

    # sessions.list can block the gateway on large session stores; cap at 8s.
    try:
        sessions_raw = await asyncio.wait_for(
            _safe_call("sessions.list", config), timeout=8
        )
    except asyncio.TimeoutError:
        logger.warning("runtime_usage.sessions_list.timeout gateway_id=%s", gateway_id)
        sessions_raw = {}

    # Extract sessions from sessions.list response (primary source)
    # Fallback to usage.cost if sessions.list fails
    if isinstance(sessions_raw, dict):
        raw_sessions = sessions_raw.get("sessions") or []
    elif isinstance(sessions_raw, list):
        raw_sessions = sessions_raw
    else:
        raw_sessions = []

    sessions: list[dict[str, Any]] = []
    if raw_sessions:
        sessions = [s for s in raw_sessions if isinstance(s, dict)]
    else:
        sessions = _parse_sessions(cost_raw)

    merged_status = {**cost_raw, **status_raw}

    oldest_event_ts = _oldest_active_ts(sessions, now)
    per_model    = aggregate_per_model(sessions, account_key=account_key)
    window       = _build_window(merged_status, now, oldest_event_ts=oldest_event_ts)
    current   = _build_current(per_model, merged_status)
    burn_rate = _compute_burn_rate(sessions, window, now)
    predictions = _build_predictions(current, burn_rate, window)
    top       = _top_sessions(sessions)

    logger.info(
        "runtime_usage.computed gateway_id=%s sessions=%d models=%d total_cost=%.6f",
        gateway_id,
        len(sessions),
        len(per_model),
        current.total_cost_usd,
    )
    return RuntimeUsageResponse(
        generated_at=now,
        gateway_id=gateway_id,
        window=window,
        current=current,
        burn_rate=burn_rate,
        predictions=predictions,
        per_model=per_model,
        top_sessions=top,
    )