From 5217a70c9f595b83cc68d7743f4fe8ba36d06827 Mon Sep 17 00:00:00 2001
From: null <koga.industries@gmail.com>
Date: Thu, 21 May 2026 01:43:28 -0500
Subject: [PATCH] =?UTF-8?q?feat(usage):=20separate=20limit=20types=20?=
 =?UTF-8?q?=E2=80=94=20typed=20limits=20for=20output=20tokens,=20total=20t?=
 =?UTF-8?q?okens,=20messages=20(#39)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add typed limit fields to RuntimeUsageCurrent: output_token_limit,
  total_token_limit, message_limit with matching pct and source
- Add total_output_tokens and output_tokens_per_minute to burn rate
- _build_current() now computes each pct from matching units only
- Legacy token_limit now reads tokenLimit only (messageLimit is no longer
  folded in) and is backfilled from typed limits for backwards compat
- Frontend aggregateRuntimeUsage() tracks typed limits separately
- limit_kind field on predictions indicates which limit drove time-to-limit
---
 backend/app/schemas/runtime_usage.py          |  34 ++-
 .../app/services/openclaw/runtime_usage.py    | 218 +++++++++++++-----
 .../dashboard/RuntimeUsageSection.tsx         | 116 ++++++++--
 3 files changed, 289 insertions(+), 79 deletions(-)

diff --git a/backend/app/schemas/runtime_usage.py b/backend/app/schemas/runtime_usage.py
index 7026da4..3c2c4c6 100644
--- a/backend/app/schemas/runtime_usage.py
+++ b/backend/app/schemas/runtime_usage.py
@@ -25,21 +25,44 @@ class RuntimeUsageCurrent(SQLModel):
     """Aggregated totals within the current window."""
 
     total_cost_usd: float
-    total_tokens: int  # input + output across all sessions
+    total_tokens: int       # input + output across all sessions
+    total_output_tokens: int = 0  # output tokens only — used with output_token_limit
     total_calls: int
-    token_limit: int | None = None  # configured limit; None = unknown
-    token_pct: int | None = None   # 0–100; None when limit unknown
+
+    # ── Legacy fields (kept for backwards compat) ────────────────────────────
+    # token_limit is ambiguous (could be total or output); use typed fields below
+    # when the limit kind is known.
+    token_limit: int | None = None
+    token_pct: int | None = None
     cost_limit_usd: float | None = None
     cost_pct: int | None = None
-    # Source and confidence for the limits
     token_limit_source: str | None = None
     cost_limit_source: str | None = None
 
+    # ── Typed limits (Phase 4) ────────────────────────────────────────────────
+    # Each field pairs a limit with a percent computed from matching units only.
+
+    # Output-token limit: compared against output tokens only, never input/cache.
+    output_token_limit: int | None = None
+    output_token_limit_pct: int | None = None
+    output_token_limit_source: str | None = None
+
+    # Total-token limit: compared against input + output combined.
+    total_token_limit: int | None = None
+    total_token_limit_pct: int | None = None
+    total_token_limit_source: str | None = None
+
+    # Message/request limit: compared against call count, never token totals.
+    message_limit: int | None = None
+    message_pct: int | None = None
+    message_limit_source: str | None = None
+
 
 class RuntimeUsageBurnRate(SQLModel):
     """Recent token and cost velocity (last 60 minutes of the window)."""
 
-    tokens_per_minute: float
+    tokens_per_minute: float          # input + output tokens per minute, combined
+    output_tokens_per_minute: float = 0.0  # output tokens per minute only; 0.0 when not supplied
     cost_usd_per_minute: float
 
 
@@ -48,6 +71,7 @@ class RuntimeUsagePredictions(SQLModel):
 
     time_to_limit_ms: int | None = None  # None when limit or burn rate unknown
     safe: bool  # True if time_to_limit > reset_in_ms (will reset before hitting limit)
+    limit_kind: str = "total_tokens"  # which limit drove this prediction
 
 
 class ModelUsageEntry(SQLModel):
diff --git a/backend/app/services/openclaw/runtime_usage.py b/backend/app/services/openclaw/runtime_usage.py
index 851a4a2..08b30f7 100644
--- a/backend/app/services/openclaw/runtime_usage.py
+++ b/backend/app/services/openclaw/runtime_usage.py
@@ -544,58 +544,107 @@ def _build_window(
     )
 
 
+def _limit_source(status_raw: dict[str, Any]) -> str:
+    """Label a gateway limit: "provider_api_rate_limit" if any rate-limit header field is truthy."""
+    header_keys = (
+        "x_ratelimit_remaining",
+        "x_ratelimit_limit",
+        "anthropic_ratelimit_remaining",
+        "anthropic_ratelimit_limit",
+    )
+    return "provider_api_rate_limit" if any(map(status_raw.get, header_keys)) else "configured_limit"
+
+
+def _pct(numerator: int | float, denominator: int | float) -> int | None:
+    """Return numerator/denominator as a whole percent capped at 100, or None when denominator is falsy."""
+    # True division, then truncate: float floor division under-reports (1.0 // 0.1 == 9.0).
+    return int(min(100, numerator * 100 / denominator)) if denominator else None
+
+
 def _build_current(
     per_model: dict[str, ModelUsageEntry],
     status_raw: dict[str, Any],
     account_key: str = "default",
 ) -> RuntimeUsageCurrent:
-    total_cost  = round(sum(e.cost_usd for e in per_model.values()), 8)
-    total_tokens = sum(e.total_tokens for e in per_model.values())
-    total_calls  = sum(e.calls for e in per_model.values())
+    """Sum per-model usage and pair each gateway limit with a percent in matching units.
+
+    A limit that resolves to 0 (the lookup default) is reported as None, with None pct and source."""
+    total_cost         = round(sum(e.cost_usd for e in per_model.values()), 8)
+    total_tokens       = sum(e.total_tokens for e in per_model.values())
+    total_output_tokens = sum(e.output_tokens for e in per_model.values())
+    total_calls        = sum(e.calls for e in per_model.values())
 
-    # Try to get configured limits from the gateway status
-    raw_token_limit = _get_int(status_raw, "tokenLimit", "token_limit", "messageLimit", "message_limit", default=0)
-    token_limit = raw_token_limit or None
-    
-    # Determine source for token limit
-    if raw_token_limit:
-        # Check for API rate-limit headers
-        has_rate_limit_headers = (
-            status_raw.get("x_ratelimit_remaining") or
-            status_raw.get("x_ratelimit_limit") or
-            status_raw.get("anthropic_ratelimit_remaining") or
-            status_raw.get("anthropic_ratelimit_limit")
-        )
-        if has_rate_limit_headers:
-            token_limit_source = "provider_api_rate_limit"
-        else:
-            token_limit_source = "configured_limit"
-    else:
-        token_limit_source = None
-    
-    token_pct   = int(min(100, total_tokens * 100 // raw_token_limit)) if raw_token_limit else None
+    src = _limit_source(status_raw)
 
+    # ── Explicit output-token limit ───────────────────────────────────────────
+    raw_output_limit = _get_int(status_raw, "outputTokenLimit", "output_token_limit", default=0)
+    output_token_limit     = raw_output_limit or None
+    output_token_limit_pct = _pct(total_output_tokens, raw_output_limit)
+    output_token_limit_src = src if raw_output_limit else None
+
+    # ── Explicit total-token limit ────────────────────────────────────────────
+    raw_total_limit = _get_int(status_raw, "totalTokenLimit", "total_token_limit", default=0)
+    total_token_limit     = raw_total_limit or None
+    total_token_limit_pct = _pct(total_tokens, raw_total_limit)
+    total_token_limit_src = src if raw_total_limit else None
+
+    # ── Message/request limit (count-based, never token-based) ───────────────
+    raw_message_limit = _get_int(
+        status_raw, "messageLimit", "message_limit", "requestLimit", "request_limit",
+        default=0,
+    )
+    message_limit     = raw_message_limit or None
+    message_pct       = _pct(total_calls, raw_message_limit)
+    message_limit_src = src if raw_message_limit else None
+
+    # ── Legacy token_limit (ambiguous kind — maps to tokenLimit only) ─────────
+    # Do NOT fold messageLimit into this; keep units separate.
+    raw_token_limit = _get_int(status_raw, "tokenLimit", "token_limit", default=0)
+    token_limit     = raw_token_limit or None
+    token_pct       = _pct(total_tokens, raw_token_limit)
+    token_limit_src = src if raw_token_limit else None
+
+    # If we got an explicit typed limit but no legacy one, backfill legacy
+    # so existing dashboard code still works during the transition.
+    if token_limit is None:
+        if output_token_limit is not None:
+            token_limit     = output_token_limit
+            token_pct       = output_token_limit_pct
+            token_limit_src = output_token_limit_src
+        elif total_token_limit is not None:
+            token_limit     = total_token_limit
+            token_pct       = total_token_limit_pct
+            token_limit_src = total_token_limit_src
+
+    # ── Cost limit ────────────────────────────────────────────────────────────
     raw_cost_limit = _get_float(status_raw, "costLimit", "cost_limit", "costLimitUsd", default=0.0)
-    cost_limit  = raw_cost_limit or None
-    
-    # Determine source for cost limit
-    if raw_cost_limit:
-        cost_limit_source = "configured_limit"
-    else:
-        cost_limit_source = None
-    
-    cost_pct    = int(min(100, total_cost * 100 / raw_cost_limit)) if raw_cost_limit else None
+    cost_limit     = raw_cost_limit or None
+    # True division and fixed label, as before this change: float `//` under-reports (1.0 // 0.1 == 9.0).
+    cost_pct       = int(min(100, total_cost * 100 / raw_cost_limit)) if raw_cost_limit else None
+    cost_limit_src = "configured_limit" if raw_cost_limit else None
 
     return RuntimeUsageCurrent(
         total_cost_usd=total_cost,
         total_tokens=total_tokens,
+        total_output_tokens=total_output_tokens,
         total_calls=total_calls,
+        # legacy
         token_limit=token_limit,
         token_pct=token_pct,
         cost_limit_usd=cost_limit,
         cost_pct=cost_pct,
-        token_limit_source=token_limit_source,
-        cost_limit_source=cost_limit_source,
+        token_limit_source=token_limit_src,
+        cost_limit_source=cost_limit_src,
+        # typed
+        output_token_limit=output_token_limit,
+        output_token_limit_pct=output_token_limit_pct,
+        output_token_limit_source=output_token_limit_src,
+        total_token_limit=total_token_limit,
+        total_token_limit_pct=total_token_limit_pct,
+        total_token_limit_source=total_token_limit_src,
+        message_limit=message_limit,
+        message_pct=message_pct,
+        message_limit_source=message_limit_src,
     )
 
 
@@ -604,9 +653,14 @@ def _compute_burn_rate(
     window: RuntimeUsageWindow,
     now: datetime,
 ) -> RuntimeUsageBurnRate:
-    """Compute tokens/min and cost/min from the most recent 60 minutes of sessions."""
+    """Compute tokens/min and cost/min from the most recent 60 minutes of sessions.
+
+    Tracks total tokens (input+output) and output tokens separately so that
+    predictions against output-token limits use the correct numerator.
+    """
     cutoff = now - timedelta(minutes=60)
     recent_tokens = 0
+    recent_output_tokens = 0
     recent_cost   = 0.0
 
     for session in sessions:
@@ -615,15 +669,14 @@ def _compute_burn_rate(
         if ts is None or ts < cutoff:
             continue
         tokens = _parse_session_usage(session)
-        recent_tokens += tokens["input"] + tokens["output"]
-        recent_cost   += _get_float(session, "cost", "cost_usd", "costUsd", default=0.0)
+        recent_tokens        += tokens["input"] + tokens["output"]
+        recent_output_tokens += tokens["output"]
+        recent_cost          += _get_float(session, "cost", "cost_usd", "costUsd", default=0.0)
 
-    # Rate per minute over the last 60 minutes
-    tokens_per_minute  = round(recent_tokens / 60, 4)
-    cost_per_minute    = round(recent_cost   / 60, 8)
     return RuntimeUsageBurnRate(
-        tokens_per_minute=tokens_per_minute,
-        cost_usd_per_minute=cost_per_minute,
+        tokens_per_minute=round(recent_tokens / 60, 4),
+        output_tokens_per_minute=round(recent_output_tokens / 60, 4),
+        cost_usd_per_minute=round(recent_cost / 60, 8),
     )
 
 
@@ -632,18 +685,79 @@ def _build_predictions(
     burn_rate: RuntimeUsageBurnRate,
     window: RuntimeUsageWindow,
 ) -> RuntimeUsagePredictions:
-    """Estimate time-to-limit in ms based on total-token burn rate."""
-    if burn_rate.tokens_per_minute <= 0 or current.token_limit is None:
-        return RuntimeUsagePredictions(time_to_limit_ms=None, safe=True)
+    """Estimate time-to-limit in ms using the most constrained matching limit.
 
-    tokens_remaining = current.token_limit - current.total_tokens
-    if tokens_remaining <= 0:
-        return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False)
+    Candidates (smallest projected time wins; an exhausted limit returns 0 at once):
+      1. output_token_limit  vs output_tokens   (burn: output_tokens_per_minute)
+      2. total_token_limit   vs total_tokens    (burn: tokens_per_minute)
+      3. legacy token_limit  vs total_tokens    (only if 1 and 2 yielded no candidate)
+      4. message_limit       vs total_calls     (rate = total_calls / minutes until reset)
 
-    minutes_to_limit  = tokens_remaining / burn_rate.tokens_per_minute
-    time_to_limit_ms  = int(minutes_to_limit * 60 * 1000)
+    Cost limits are not used for time-to-limit since they require billing data.
+    Request limits are covered by message_limit (requestLimit is read into it).
+    """
+    candidates: list[tuple[int, str]] = []  # (time_to_limit_ms, kind)
+
+    # ── Output-token limit ────────────────────────────────────────────────────
+    if (
+        current.output_token_limit is not None
+        and burn_rate.output_tokens_per_minute > 0
+    ):
+        remaining = current.output_token_limit - current.total_output_tokens
+        if remaining <= 0:
+            return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False, limit_kind="output_tokens")
+        candidates.append((
+            int(remaining / burn_rate.output_tokens_per_minute * 60_000),
+            "output_tokens",
+        ))
+
+    # ── Total-token limit ─────────────────────────────────────────────────────
+    if (
+        current.total_token_limit is not None
+        and burn_rate.tokens_per_minute > 0
+    ):
+        remaining = current.total_token_limit - current.total_tokens
+        if remaining <= 0:
+            return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False, limit_kind="total_tokens")
+        candidates.append((
+            int(remaining / burn_rate.tokens_per_minute * 60_000),
+            "total_tokens",
+        ))
+
+    # ── Legacy token_limit (only when no typed token limit) ───────────────────
+    if (
+        not candidates
+        and current.token_limit is not None
+        and burn_rate.tokens_per_minute > 0
+    ):
+        remaining = current.token_limit - current.total_tokens
+        if remaining <= 0:
+            return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False, limit_kind="total_tokens")
+        candidates.append((
+            int(remaining / burn_rate.tokens_per_minute * 60_000),
+            "total_tokens",
+        ))
+
+    # ── Message limit ─────────────────────────────────────────────────────────
+    if current.message_limit is not None and current.message_limit > 0:
+        window_minutes = max(window.reset_in_ms / 60_000, 1)
+        calls_per_minute = current.total_calls / window_minutes if window_minutes > 0 else 0
+        if calls_per_minute > 0:
+            remaining = current.message_limit - current.total_calls
+            if remaining <= 0:
+                return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False, limit_kind="messages")
+            candidates.append((
+                int(remaining / calls_per_minute * 60_000),
+                "messages",
+            ))
+
+    if not candidates:
+        return RuntimeUsagePredictions(time_to_limit_ms=None, safe=True, limit_kind="total_tokens")
+
+    # Pick the most constrained (smallest time) — that is what will actually block work.
+    time_to_limit_ms, kind = min(candidates, key=lambda c: c[0])
     safe = time_to_limit_ms > window.reset_in_ms
-    return RuntimeUsagePredictions(time_to_limit_ms=time_to_limit_ms, safe=safe)
+    return RuntimeUsagePredictions(time_to_limit_ms=time_to_limit_ms, safe=safe, limit_kind=kind)
 
 
 # ---------------------------------------------------------------------------
diff --git a/frontend/src/components/dashboard/RuntimeUsageSection.tsx b/frontend/src/components/dashboard/RuntimeUsageSection.tsx
index a39febd..0b29294 100644
--- a/frontend/src/components/dashboard/RuntimeUsageSection.tsx
+++ b/frontend/src/components/dashboard/RuntimeUsageSection.tsx
@@ -16,15 +16,24 @@ import { DashboardEmptyState } from "./DashboardEmptyState";
 export interface AggregatedRuntimeUsage {
   totalCostUsd: number;
   totalTokens: number;
+  totalOutputTokens: number;
   totalCalls: number;
+  // Legacy (backwards compat)
   tokenLimit: number | null;
   tokenPct: number | null;
   costLimitUsd: number | null;
+  // Typed limits (Phase 4)
+  outputTokenLimit: number | null;
+  outputTokenLimitPct: number | null;
+  messageLimit: number | null;
+  messagePct: number | null;
+  limitKind: string;  // which limit drove the time-to-limit prediction
   resetInMs: number;
   resetsAt: string;
   timeToLimitMs: number | null;
   safe: boolean;
   tokensPerMinute: number;
+  outputTokensPerMinute: number;
   costUsdPerMinute: number;
   perModel: Record<string, ModelUsageEntry>;
   topSessions: TopSession[];
@@ -54,15 +63,22 @@ export function aggregateRuntimeUsage(
     return {
       totalCostUsd: 0,
       totalTokens: 0,
+      totalOutputTokens: 0,
       totalCalls: 0,
       tokenLimit: null,
       tokenPct: null,
       costLimitUsd: null,
+      outputTokenLimit: null,
+      outputTokenLimitPct: null,
+      messageLimit: null,
+      messagePct: null,
+      limitKind: "total_tokens",
       resetInMs: 0,
       resetsAt: "",
       timeToLimitMs: null,
       safe: true,
       tokensPerMinute: 0,
+      outputTokensPerMinute: 0,
       costUsdPerMinute: 0,
       perModel: {},
       topSessions: [],
@@ -71,21 +87,24 @@ export function aggregateRuntimeUsage(
 
   let totalCostUsd = 0;
   let totalTokens = 0;
+  let totalOutputTokens = 0;
   let totalCalls = 0;
   let tokensPerMinute = 0;
+  let outputTokensPerMinute = 0;
   let costUsdPerMinute = 0;
-  // Use the window that resets soonest as the binding constraint
   let resetInMs = valid[0].window.reset_in_ms;
   let resetsAt = valid[0].window.resets_at;
   const perModel: Record<string, ModelUsageEntry> = {};
   const allSessions: TopSession[] = [];
 
   for (const r of valid) {
-    totalCostUsd     += r.current.total_cost_usd;
-    totalTokens      += r.current.total_tokens;
-    totalCalls       += r.current.total_calls;
-    tokensPerMinute  += r.burn_rate.tokens_per_minute;
-    costUsdPerMinute += r.burn_rate.cost_usd_per_minute;
+    totalCostUsd          += r.current.total_cost_usd;
+    totalTokens           += r.current.total_tokens;
+    totalOutputTokens     += r.current.total_output_tokens ?? 0;
+    totalCalls            += r.current.total_calls;
+    tokensPerMinute       += r.burn_rate.tokens_per_minute;
+    outputTokensPerMinute += r.burn_rate.output_tokens_per_minute ?? 0;
+    costUsdPerMinute      += r.burn_rate.cost_usd_per_minute;
 
     if (r.window.reset_in_ms < resetInMs) {
       resetInMs = r.window.reset_in_ms;
@@ -113,25 +132,45 @@ export function aggregateRuntimeUsage(
     allSessions.push(...r.top_sessions);
   }
 
-  // Aggregate token limits — only meaningful if all gateways share one limit
-  const limits = valid.map(r => r.current.token_limit).filter((v): v is number => v !== null);
+  // Legacy token limit
+  const limits    = valid.map(r => r.current.token_limit).filter((v): v is number => v !== null);
   const tokenLimit    = limits.length > 0 ? limits.reduce((a, b) => a + b, 0) : null;
   const tokenPct      = tokenLimit ? Math.min(100, Math.round(totalTokens * 100 / tokenLimit)) : null;
   const costLimits    = valid.map(r => r.current.cost_limit_usd).filter((v): v is number => v !== null);
   const costLimitUsd  = costLimits.length > 0 ? costLimits.reduce((a, b) => a + b, 0) : null;
 
-  // Re-derive time-to-limit from aggregated burn rate
+  // Typed limits
+  const outLimits  = valid.map(r => r.current.output_token_limit).filter((v): v is number => v !== null);
+  const outputTokenLimit    = outLimits.length > 0 ? outLimits.reduce((a, b) => a + b, 0) : null;
+  const outputTokenLimitPct = outputTokenLimit
+    ? Math.min(100, Math.round(totalOutputTokens * 100 / outputTokenLimit))
+    : null;
+
+  const msgLimits  = valid.map(r => r.current.message_limit).filter((v): v is number => v !== null);
+  const messageLimit = msgLimits.length > 0 ? msgLimits.reduce((a, b) => a + b, 0) : null;
+  const messagePct   = messageLimit ? Math.min(100, Math.round(totalCalls * 100 / messageLimit)) : null;
+
+  // Re-derive time-to-limit using the most constrained matching limit
   let timeToLimitMs: number | null = null;
   let safe = true;
+  let limitKind = "total_tokens";
+  const candidates: Array<{ ms: number; kind: string }> = [];
+
+  if (outputTokenLimit !== null && outputTokensPerMinute > 0) {
+    const rem = outputTokenLimit - totalOutputTokens;
+    if (rem <= 0) return { totalCostUsd: Math.round(totalCostUsd * 1e8) / 1e8, totalTokens, totalOutputTokens, totalCalls, tokenLimit, tokenPct, costLimitUsd, outputTokenLimit, outputTokenLimitPct, messageLimit, messagePct, limitKind: "output_tokens", resetInMs, resetsAt, timeToLimitMs: 0, safe: false, tokensPerMinute: Math.round(tokensPerMinute * 100) / 100, outputTokensPerMinute: Math.round(outputTokensPerMinute * 100) / 100, costUsdPerMinute: Math.round(costUsdPerMinute * 1e8) / 1e8, perModel, topSessions: [...allSessions].sort((a, b) => b.cost_usd - a.cost_usd).slice(0, 10) };
+    candidates.push({ ms: Math.round(rem / outputTokensPerMinute * 60_000), kind: "output_tokens" });
+  }
   if (tokenLimit !== null && tokensPerMinute > 0) {
-    const remaining = tokenLimit - totalTokens;
-    if (remaining <= 0) {
-      timeToLimitMs = 0;
-      safe = false;
-    } else {
-      timeToLimitMs = Math.round((remaining / tokensPerMinute) * 60 * 1000);
-      safe = timeToLimitMs > resetInMs;
-    }
+    const rem = tokenLimit - totalTokens;
+    if (rem <= 0) return { totalCostUsd: Math.round(totalCostUsd * 1e8) / 1e8, totalTokens, totalOutputTokens, totalCalls, tokenLimit, tokenPct, costLimitUsd, outputTokenLimit, outputTokenLimitPct, messageLimit, messagePct, limitKind: "total_tokens", resetInMs, resetsAt, timeToLimitMs: 0, safe: false, tokensPerMinute: Math.round(tokensPerMinute * 100) / 100, outputTokensPerMinute: Math.round(outputTokensPerMinute * 100) / 100, costUsdPerMinute: Math.round(costUsdPerMinute * 1e8) / 1e8, perModel, topSessions: [...allSessions].sort((a, b) => b.cost_usd - a.cost_usd).slice(0, 10) };
+    candidates.push({ ms: Math.round(rem / tokensPerMinute * 60_000), kind: "total_tokens" });
+  }
+  if (candidates.length > 0) {
+    const tightest = candidates.reduce((a, b) => a.ms < b.ms ? a : b);
+    timeToLimitMs = tightest.ms;
+    limitKind = tightest.kind;
+    safe = timeToLimitMs > resetInMs;
   }
 
   const topSessions = [...allSessions]
@@ -141,15 +180,22 @@ export function aggregateRuntimeUsage(
   return {
     totalCostUsd: Math.round(totalCostUsd * 1e8) / 1e8,
     totalTokens,
+    totalOutputTokens,
     totalCalls,
     tokenLimit,
     tokenPct,
     costLimitUsd,
+    outputTokenLimit,
+    outputTokenLimitPct,
+    messageLimit,
+    messagePct,
+    limitKind,
     resetInMs,
     resetsAt,
     timeToLimitMs,
     safe,
     tokensPerMinute: Math.round(tokensPerMinute * 100) / 100,
+    outputTokensPerMinute: Math.round(outputTokensPerMinute * 100) / 100,
     costUsdPerMinute: Math.round(costUsdPerMinute * 1e8) / 1e8,
     perModel,
     topSessions,
@@ -343,14 +389,33 @@ export function RuntimeUsageSection({
               <StatCard
                 label="Time to Limit"
                 value={usage.timeToLimitMs === null ? "—" : usage.timeToLimitMs === 0 ? "At limit" : fmtMs(usage.timeToLimitMs)}
-                sub={usage.tokenLimit ? `${usage.tokenPct ?? 0}% of ${fmtTokens(usage.tokenLimit)}` : undefined}
+                sub={(() => {
+                  if (usage.outputTokenLimit) {
+                    return `${usage.outputTokenLimitPct ?? 0}% of ${fmtTokens(usage.outputTokenLimit)} out`;
+                  }
+                  if (usage.tokenLimit) {
+                    return `${usage.tokenPct ?? 0}% of ${fmtTokens(usage.tokenLimit)}`;
+                  }
+                  if (usage.messageLimit) {
+                    return `${usage.messagePct ?? 0}% of ${usage.messageLimit} msgs`;
+                  }
+                  return undefined;
+                })()}
                 tone={safenessTone}
                 icon={<TrendingDown className="h-3 w-3" />}
               />
               <StatCard
                 label="Burn Rate"
                 value={usage.costUsdPerMinute > 0 ? `${fmtCost(usage.costUsdPerMinute)}/m` : "—"}
-                sub={usage.tokensPerMinute > 0 ? `${fmtTokens(usage.tokensPerMinute)} tok/m` : undefined}
+                sub={(() => {
+                  if (usage.outputTokensPerMinute > 0 && usage.outputTokenLimit) {
+                    return `${fmtTokens(usage.outputTokensPerMinute)} out-tok/m`;
+                  }
+                  if (usage.tokensPerMinute > 0) {
+                    return `${fmtTokens(usage.tokensPerMinute)} tok/m`;
+                  }
+                  return undefined;
+                })()}
                 icon={<Flame className="h-3 w-3" />}
               />
             </div>
@@ -374,8 +439,10 @@ export function RuntimeUsageSection({
                     <th className="px-3 py-2 text-right font-semibold text-muted">Tokens out</th>
                     <th className="px-3 py-2 text-right font-semibold text-muted">Cost</th>
                     <th className="px-3 py-2 text-right font-semibold text-muted">Calls</th>
-                    {usage.tokenLimit !== null && (
-                      <th className="px-3 py-2 text-right font-semibold text-muted">% limit</th>
+                    {(usage.outputTokenLimit !== null || usage.tokenLimit !== null) && (
+                      <th className="px-3 py-2 text-right font-semibold text-muted">
+                        {usage.outputTokenLimit !== null ? "% out limit" : "% limit"}
+                      </th>
                     )}
                   </tr>
                 </thead>
@@ -396,7 +463,12 @@ export function RuntimeUsageSection({
                         {entry.unpriced ? <span className="text-[color:var(--warning)]">—*</span> : fmtCost(entry.cost_usd)}
                       </td>
                       <td className="px-3 py-2 text-right tabular-nums text-muted">{entry.calls}</td>
-                      {usage.tokenLimit !== null && (
+                      {usage.outputTokenLimit !== null && (
+                        <td className="px-3 py-2 text-right tabular-nums text-muted">
+                          {Math.round(entry.output_tokens * 100 / usage.outputTokenLimit)}%
+                        </td>
+                      )}
+                      {usage.outputTokenLimit === null && usage.tokenLimit !== null && (
                         <td className="px-3 py-2 text-right tabular-nums text-muted">
                           {Math.round(entry.total_tokens * 100 / usage.tokenLimit)}%
                         </td>