feat(usage): separate limit types — typed limits for output tokens, total tokens, messages (#39)

- Add typed limit fields to RuntimeUsageCurrent: output_token_limit, total_token_limit, message_limit with matching pct and source
- Add total_output_tokens and output_tokens_per_minute to burn rate
- _build_current() now computes each pct from matching units only
- Legacy token_limit backfilled from typed limits for backwards compat
- Frontend aggregateRuntimeUsage() tracks typed limits separately
- limit_kind field on predictions indicates which limit drove time-to-limit
2026-05-21 01:43:28 -05:00 · 2026-05-21 01:43:28 -05:00 · 5217a70c9f
parent 02eb03d408
commit 5217a70c9f
3 changed files with 289 additions and 79 deletions
--- a/backend/app/schemas/runtime_usage.py
+++ b/backend/app/schemas/runtime_usage.py
@ -25,21 +25,44 @@ class RuntimeUsageCurrent(SQLModel):
    """Aggregated totals within the current window."""

    total_cost_usd: float
-    total_tokens: int  # input + output across all sessions
+    total_tokens: int       # input + output across all sessions
+    total_output_tokens: int = 0  # output tokens only — used with output_token_limit
    total_calls: int
-    token_limit: int | None = None  # configured limit; None = unknown
-    token_pct: int | None = None   # 0–100; None when limit unknown
+
+    # ── Legacy fields (kept for backwards compat) ────────────────────────────
+    # token_limit is ambiguous (could be total or output); use typed fields below
+    # when the limit kind is known.
+    token_limit: int | None = None
+    token_pct: int | None = None
    cost_limit_usd: float | None = None
    cost_pct: int | None = None
-    # Source and confidence for the limits
    token_limit_source: str | None = None
    cost_limit_source: str | None = None

+    # ── Typed limits (Phase 4) ────────────────────────────────────────────────
+    # Each field pairs a limit with a percent computed from matching units only.
+
+    # Output-token limit: compared against output tokens only, never input/cache.
+    output_token_limit: int | None = None
+    output_token_limit_pct: int | None = None
+    output_token_limit_source: str | None = None
+
+    # Total-token limit: compared against input + output combined.
+    total_token_limit: int | None = None
+    total_token_limit_pct: int | None = None
+    total_token_limit_source: str | None = None
+
+    # Message/request limit: compared against call count, never token totals.
+    message_limit: int | None = None
+    message_pct: int | None = None
+    message_limit_source: str | None = None
+

 class RuntimeUsageBurnRate(SQLModel):
    """Recent token and cost velocity (last 60 minutes of the window)."""

-    tokens_per_minute: float
+    tokens_per_minute: float          # input + output combined
+    output_tokens_per_minute: float = 0.0  # output tokens only
    cost_usd_per_minute: float


@ -48,6 +71,7 @@ class RuntimeUsagePredictions(SQLModel):

    time_to_limit_ms: int | None = None  # None when limit or burn rate unknown
    safe: bool  # True if time_to_limit > reset_in_ms (will reset before hitting limit)
+    limit_kind: str = "total_tokens"  # which limit drove this prediction


 class ModelUsageEntry(SQLModel):
--- a/backend/app/services/openclaw/runtime_usage.py
+++ b/backend/app/services/openclaw/runtime_usage.py
@ -544,58 +544,107 @@ def _build_window(
    )


+def _limit_source(status_raw: dict[str, Any]) -> str:
+    """Return the appropriate source label for a limit read from gateway status."""
+    has_rate_limit_headers = (
+        status_raw.get("x_ratelimit_remaining") or
+        status_raw.get("x_ratelimit_limit") or
+        status_raw.get("anthropic_ratelimit_remaining") or
+        status_raw.get("anthropic_ratelimit_limit")
+    )
+    return "provider_api_rate_limit" if has_rate_limit_headers else "configured_limit"
+
+
+def _pct(numerator: int | float, denominator: int | float) -> int | None:
+    if not denominator:
+        return None
+    return int(min(100, numerator * 100 // denominator))
+
+
 def _build_current(
    per_model: dict[str, ModelUsageEntry],
    status_raw: dict[str, Any],
    account_key: str = "default",
 ) -> RuntimeUsageCurrent:
-    total_cost  = round(sum(e.cost_usd for e in per_model.values()), 8)
-    total_tokens = sum(e.total_tokens for e in per_model.values())
-    total_calls  = sum(e.calls for e in per_model.values())
+    total_cost         = round(sum(e.cost_usd for e in per_model.values()), 8)
+    total_tokens       = sum(e.total_tokens for e in per_model.values())
+    total_output_tokens = sum(e.output_tokens for e in per_model.values())
+    total_calls        = sum(e.calls for e in per_model.values())

-    # Try to get configured limits from the gateway status
-    raw_token_limit = _get_int(status_raw, "tokenLimit", "token_limit", "messageLimit", "message_limit", default=0)
-    token_limit = raw_token_limit or None
-    
-    # Determine source for token limit
-    if raw_token_limit:
-        # Check for API rate-limit headers
-        has_rate_limit_headers = (
-            status_raw.get("x_ratelimit_remaining") or
-            status_raw.get("x_ratelimit_limit") or
-            status_raw.get("anthropic_ratelimit_remaining") or
-            status_raw.get("anthropic_ratelimit_limit")
-        )
-        if has_rate_limit_headers:
-            token_limit_source = "provider_api_rate_limit"
-        else:
-            token_limit_source = "configured_limit"
-    else:
-        token_limit_source = None
-    
-    token_pct   = int(min(100, total_tokens * 100 // raw_token_limit)) if raw_token_limit else None
+    src = _limit_source(status_raw)

+    # ── Explicit output-token limit ───────────────────────────────────────────
+    raw_output_limit = _get_int(
+        status_raw, "outputTokenLimit", "output_token_limit", default=0
+    )
+    output_token_limit     = raw_output_limit or None
+    output_token_limit_pct = _pct(total_output_tokens, raw_output_limit)
+    output_token_limit_src = src if raw_output_limit else None
+
+    # ── Explicit total-token limit ────────────────────────────────────────────
+    raw_total_limit = _get_int(
+        status_raw, "totalTokenLimit", "total_token_limit", default=0
+    )
+    total_token_limit     = raw_total_limit or None
+    total_token_limit_pct = _pct(total_tokens, raw_total_limit)
+    total_token_limit_src = src if raw_total_limit else None
+
+    # ── Message/request limit (count-based, never token-based) ───────────────
+    raw_message_limit = _get_int(
+        status_raw, "messageLimit", "message_limit", "requestLimit", "request_limit",
+        default=0,
+    )
+    message_limit     = raw_message_limit or None
+    message_pct       = _pct(total_calls, raw_message_limit)
+    message_limit_src = src if raw_message_limit else None
+
+    # ── Legacy token_limit (ambiguous kind — maps to tokenLimit only) ─────────
+    # Do NOT fold messageLimit into this; keep units separate.
+    raw_token_limit = _get_int(status_raw, "tokenLimit", "token_limit", default=0)
+    token_limit     = raw_token_limit or None
+    token_pct       = _pct(total_tokens, raw_token_limit)
+    token_limit_src = src if raw_token_limit else None
+
+    # If we got an explicit typed limit but no legacy one, backfill legacy
+    # so existing dashboard code still works during the transition.
+    if token_limit is None:
+        if output_token_limit is not None:
+            token_limit     = output_token_limit
+            token_pct       = output_token_limit_pct
+            token_limit_src = output_token_limit_src
+        elif total_token_limit is not None:
+            token_limit     = total_token_limit
+            token_pct       = total_token_limit_pct
+            token_limit_src = total_token_limit_src
+
+    # ── Cost limit ────────────────────────────────────────────────────────────
    raw_cost_limit = _get_float(status_raw, "costLimit", "cost_limit", "costLimitUsd", default=0.0)
-    cost_limit  = raw_cost_limit or None
-    
-    # Determine source for cost limit
-    if raw_cost_limit:
-        cost_limit_source = "configured_limit"
-    else:
-        cost_limit_source = None
-    
-    cost_pct    = int(min(100, total_cost * 100 / raw_cost_limit)) if raw_cost_limit else None
+    cost_limit     = raw_cost_limit or None
+    cost_pct       = _pct(total_cost, raw_cost_limit) if raw_cost_limit else None
+    cost_limit_src = src if raw_cost_limit else None

    return RuntimeUsageCurrent(
        total_cost_usd=total_cost,
        total_tokens=total_tokens,
+        total_output_tokens=total_output_tokens,
        total_calls=total_calls,
+        # legacy
        token_limit=token_limit,
        token_pct=token_pct,
        cost_limit_usd=cost_limit,
        cost_pct=cost_pct,
-        token_limit_source=token_limit_source,
-        cost_limit_source=cost_limit_source,
+        token_limit_source=token_limit_src,
+        cost_limit_source=cost_limit_src,
+        # typed
+        output_token_limit=output_token_limit,
+        output_token_limit_pct=output_token_limit_pct,
+        output_token_limit_source=output_token_limit_src,
+        total_token_limit=total_token_limit,
+        total_token_limit_pct=total_token_limit_pct,
+        total_token_limit_source=total_token_limit_src,
+        message_limit=message_limit,
+        message_pct=message_pct,
+        message_limit_source=message_limit_src,
    )


@ -604,9 +653,14 @@ def _compute_burn_rate(
    window: RuntimeUsageWindow,
    now: datetime,
 ) -> RuntimeUsageBurnRate:
-    """Compute tokens/min and cost/min from the most recent 60 minutes of sessions."""
+    """Compute tokens/min and cost/min from the most recent 60 minutes of sessions.
+
+    Tracks total tokens (input+output) and output tokens separately so that
+    predictions against output-token limits use the correct numerator.
+    """
    cutoff = now - timedelta(minutes=60)
    recent_tokens = 0
+    recent_output_tokens = 0
    recent_cost   = 0.0

    for session in sessions:
@ -615,15 +669,14 @@ def _compute_burn_rate(
        if ts is None or ts < cutoff:
            continue
        tokens = _parse_session_usage(session)
-        recent_tokens += tokens["input"] + tokens["output"]
-        recent_cost   += _get_float(session, "cost", "cost_usd", "costUsd", default=0.0)
+        recent_tokens        += tokens["input"] + tokens["output"]
+        recent_output_tokens += tokens["output"]
+        recent_cost          += _get_float(session, "cost", "cost_usd", "costUsd", default=0.0)

-    # Rate per minute over the last 60 minutes
-    tokens_per_minute  = round(recent_tokens / 60, 4)
-    cost_per_minute    = round(recent_cost   / 60, 8)
    return RuntimeUsageBurnRate(
-        tokens_per_minute=tokens_per_minute,
-        cost_usd_per_minute=cost_per_minute,
+        tokens_per_minute=round(recent_tokens / 60, 4),
+        output_tokens_per_minute=round(recent_output_tokens / 60, 4),
+        cost_usd_per_minute=round(recent_cost / 60, 8),
    )


@ -632,18 +685,79 @@ def _build_predictions(
    burn_rate: RuntimeUsageBurnRate,
    window: RuntimeUsageWindow,
 ) -> RuntimeUsagePredictions:
-    """Estimate time-to-limit in ms based on total-token burn rate."""
-    if burn_rate.tokens_per_minute <= 0 or current.token_limit is None:
-        return RuntimeUsagePredictions(time_to_limit_ms=None, safe=True)
+    """Estimate time-to-limit in ms using the most constrained matching limit.

-    tokens_remaining = current.token_limit - current.total_tokens
-    if tokens_remaining <= 0:
-        return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False)
+    Candidate limits (smallest time-to-limit wins; an exhausted limit returns 0 at once):
+      1. output_token_limit  vs output_tokens   (burn: output_tokens_per_minute)
+      2. total_token_limit   vs total_tokens    (burn: tokens_per_minute)
+      3. legacy token_limit  vs total_tokens    (burn: tokens_per_minute; only if 1-2 gave no candidate)
+      4. message_limit       vs total_calls     (rate = total_calls / max(reset_in_ms in minutes, 1))

-    minutes_to_limit  = tokens_remaining / burn_rate.tokens_per_minute
-    time_to_limit_ms  = int(minutes_to_limit * 60 * 1000)
+    Cost limits are not used for time-to-limit since they require billing data.
+    Request limits are not separate: they are read into message_limit (item 4).
+    """
+    candidates: list[tuple[int, str]] = []  # (time_to_limit_ms, kind)
+
+    # ── Output-token limit ────────────────────────────────────────────────────
+    if (
+        current.output_token_limit is not None
+        and burn_rate.output_tokens_per_minute > 0
+    ):
+        remaining = current.output_token_limit - current.total_output_tokens
+        if remaining <= 0:
+            return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False, limit_kind="output_tokens")
+        candidates.append((
+            int(remaining / burn_rate.output_tokens_per_minute * 60_000),
+            "output_tokens",
+        ))
+
+    # ── Total-token limit ─────────────────────────────────────────────────────
+    if (
+        current.total_token_limit is not None
+        and burn_rate.tokens_per_minute > 0
+    ):
+        remaining = current.total_token_limit - current.total_tokens
+        if remaining <= 0:
+            return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False, limit_kind="total_tokens")
+        candidates.append((
+            int(remaining / burn_rate.tokens_per_minute * 60_000),
+            "total_tokens",
+        ))
+
+    # ── Legacy token_limit (only when no typed token limit) ───────────────────
+    if (
+        not candidates
+        and current.token_limit is not None
+        and burn_rate.tokens_per_minute > 0
+    ):
+        remaining = current.token_limit - current.total_tokens
+        if remaining <= 0:
+            return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False, limit_kind="total_tokens")
+        candidates.append((
+            int(remaining / burn_rate.tokens_per_minute * 60_000),
+            "total_tokens",
+        ))
+
+    # ── Message limit ─────────────────────────────────────────────────────────
+    if current.message_limit is not None and current.message_limit > 0:
+        window_minutes = max(window.reset_in_ms / 60_000, 1)
+        calls_per_minute = current.total_calls / window_minutes if window_minutes > 0 else 0
+        if calls_per_minute > 0:
+            remaining = current.message_limit - current.total_calls
+            if remaining <= 0:
+                return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False, limit_kind="messages")
+            candidates.append((
+                int(remaining / calls_per_minute * 60_000),
+                "messages",
+            ))
+
+    if not candidates:
+        return RuntimeUsagePredictions(time_to_limit_ms=None, safe=True, limit_kind="total_tokens")
+
+    # Pick the most constrained (smallest time) — that is what will actually block work.
+    time_to_limit_ms, kind = min(candidates, key=lambda c: c[0])
    safe = time_to_limit_ms > window.reset_in_ms
-    return RuntimeUsagePredictions(time_to_limit_ms=time_to_limit_ms, safe=safe)
+    return RuntimeUsagePredictions(time_to_limit_ms=time_to_limit_ms, safe=safe, limit_kind=kind)


 # ---------------------------------------------------------------------------
--- a/frontend/src/components/dashboard/RuntimeUsageSection.tsx
+++ b/frontend/src/components/dashboard/RuntimeUsageSection.tsx
@ -16,15 +16,24 @@ import { DashboardEmptyState } from "./DashboardEmptyState";
 export interface AggregatedRuntimeUsage {
  totalCostUsd: number;
  totalTokens: number;
+  totalOutputTokens: number;
  totalCalls: number;
+  // Legacy (backwards compat)
  tokenLimit: number | null;
  tokenPct: number | null;
  costLimitUsd: number | null;
+  // Typed limits (Phase 4)
+  outputTokenLimit: number | null;
+  outputTokenLimitPct: number | null;
+  messageLimit: number | null;
+  messagePct: number | null;
+  limitKind: string;  // which limit drove the time-to-limit prediction
  resetInMs: number;
  resetsAt: string;
  timeToLimitMs: number | null;
  safe: boolean;
  tokensPerMinute: number;
+  outputTokensPerMinute: number;
  costUsdPerMinute: number;
  perModel: Record<string, ModelUsageEntry>;
  topSessions: TopSession[];
@ -54,15 +63,22 @@ export function aggregateRuntimeUsage(
    return {
      totalCostUsd: 0,
      totalTokens: 0,
+      totalOutputTokens: 0,
      totalCalls: 0,
      tokenLimit: null,
      tokenPct: null,
      costLimitUsd: null,
+      outputTokenLimit: null,
+      outputTokenLimitPct: null,
+      messageLimit: null,
+      messagePct: null,
+      limitKind: "total_tokens",
      resetInMs: 0,
      resetsAt: "",
      timeToLimitMs: null,
      safe: true,
      tokensPerMinute: 0,
+      outputTokensPerMinute: 0,
      costUsdPerMinute: 0,
      perModel: {},
      topSessions: [],
@ -71,21 +87,24 @@ export function aggregateRuntimeUsage(

  let totalCostUsd = 0;
  let totalTokens = 0;
+  let totalOutputTokens = 0;
  let totalCalls = 0;
  let tokensPerMinute = 0;
+  let outputTokensPerMinute = 0;
  let costUsdPerMinute = 0;
-  // Use the window that resets soonest as the binding constraint
  let resetInMs = valid[0].window.reset_in_ms;
  let resetsAt = valid[0].window.resets_at;
  const perModel: Record<string, ModelUsageEntry> = {};
  const allSessions: TopSession[] = [];

  for (const r of valid) {
-    totalCostUsd     += r.current.total_cost_usd;
-    totalTokens      += r.current.total_tokens;
-    totalCalls       += r.current.total_calls;
-    tokensPerMinute  += r.burn_rate.tokens_per_minute;
-    costUsdPerMinute += r.burn_rate.cost_usd_per_minute;
+    totalCostUsd          += r.current.total_cost_usd;
+    totalTokens           += r.current.total_tokens;
+    totalOutputTokens     += r.current.total_output_tokens ?? 0;
+    totalCalls            += r.current.total_calls;
+    tokensPerMinute       += r.burn_rate.tokens_per_minute;
+    outputTokensPerMinute += r.burn_rate.output_tokens_per_minute ?? 0;
+    costUsdPerMinute      += r.burn_rate.cost_usd_per_minute;

    if (r.window.reset_in_ms < resetInMs) {
      resetInMs = r.window.reset_in_ms;
@ -113,25 +132,45 @@ export function aggregateRuntimeUsage(
    allSessions.push(...r.top_sessions);
  }

-  // Aggregate token limits — only meaningful if all gateways share one limit
-  const limits = valid.map(r => r.current.token_limit).filter((v): v is number => v !== null);
+  // Legacy token limit
+  const limits    = valid.map(r => r.current.token_limit).filter((v): v is number => v !== null);
  const tokenLimit    = limits.length > 0 ? limits.reduce((a, b) => a + b, 0) : null;
  const tokenPct      = tokenLimit ? Math.min(100, Math.round(totalTokens * 100 / tokenLimit)) : null;
  const costLimits    = valid.map(r => r.current.cost_limit_usd).filter((v): v is number => v !== null);
  const costLimitUsd  = costLimits.length > 0 ? costLimits.reduce((a, b) => a + b, 0) : null;

-  // Re-derive time-to-limit from aggregated burn rate
+  // Typed limits
+  const outLimits  = valid.map(r => r.current.output_token_limit).filter((v): v is number => v !== null);
+  const outputTokenLimit    = outLimits.length > 0 ? outLimits.reduce((a, b) => a + b, 0) : null;
+  const outputTokenLimitPct = outputTokenLimit
+    ? Math.min(100, Math.round(totalOutputTokens * 100 / outputTokenLimit))
+    : null;
+
+  const msgLimits  = valid.map(r => r.current.message_limit).filter((v): v is number => v !== null);
+  const messageLimit = msgLimits.length > 0 ? msgLimits.reduce((a, b) => a + b, 0) : null;
+  const messagePct   = messageLimit ? Math.min(100, Math.round(totalCalls * 100 / messageLimit)) : null;
+
+  // Re-derive time-to-limit using the most constrained matching limit
  let timeToLimitMs: number | null = null;
  let safe = true;
+  let limitKind = "total_tokens";
+  const candidates: Array<{ ms: number; kind: string }> = [];
+
+  if (outputTokenLimit !== null && outputTokensPerMinute > 0) {
+    const rem = outputTokenLimit - totalOutputTokens;
+    if (rem <= 0) return { totalCostUsd: Math.round(totalCostUsd * 1e8) / 1e8, totalTokens, totalOutputTokens, totalCalls, tokenLimit, tokenPct, costLimitUsd, outputTokenLimit, outputTokenLimitPct, messageLimit, messagePct, limitKind: "output_tokens", resetInMs, resetsAt, timeToLimitMs: 0, safe: false, tokensPerMinute: Math.round(tokensPerMinute * 100) / 100, outputTokensPerMinute: Math.round(outputTokensPerMinute * 100) / 100, costUsdPerMinute: Math.round(costUsdPerMinute * 1e8) / 1e8, perModel, topSessions: [...allSessions].sort((a, b) => b.cost_usd - a.cost_usd).slice(0, 10) };
+    candidates.push({ ms: Math.round(rem / outputTokensPerMinute * 60_000), kind: "output_tokens" });
+  }
  if (tokenLimit !== null && tokensPerMinute > 0) {
-    const remaining = tokenLimit - totalTokens;
-    if (remaining <= 0) {
-      timeToLimitMs = 0;
-      safe = false;
-    } else {
-      timeToLimitMs = Math.round((remaining / tokensPerMinute) * 60 * 1000);
-      safe = timeToLimitMs > resetInMs;
-    }
+    const rem = tokenLimit - totalTokens;
+    if (rem <= 0) return { totalCostUsd: Math.round(totalCostUsd * 1e8) / 1e8, totalTokens, totalOutputTokens, totalCalls, tokenLimit, tokenPct, costLimitUsd, outputTokenLimit, outputTokenLimitPct, messageLimit, messagePct, limitKind: "total_tokens", resetInMs, resetsAt, timeToLimitMs: 0, safe: false, tokensPerMinute: Math.round(tokensPerMinute * 100) / 100, outputTokensPerMinute: Math.round(outputTokensPerMinute * 100) / 100, costUsdPerMinute: Math.round(costUsdPerMinute * 1e8) / 1e8, perModel, topSessions: [...allSessions].sort((a, b) => b.cost_usd - a.cost_usd).slice(0, 10) };
+    candidates.push({ ms: Math.round(rem / tokensPerMinute * 60_000), kind: "total_tokens" });
+  }
+  if (candidates.length > 0) {
+    const tightest = candidates.reduce((a, b) => a.ms < b.ms ? a : b);
+    timeToLimitMs = tightest.ms;
+    limitKind = tightest.kind;
+    safe = timeToLimitMs > resetInMs;
  }

  const topSessions = [...allSessions]
@ -141,15 +180,22 @@ export function aggregateRuntimeUsage(
  return {
    totalCostUsd: Math.round(totalCostUsd * 1e8) / 1e8,
    totalTokens,
+    totalOutputTokens,
    totalCalls,
    tokenLimit,
    tokenPct,
    costLimitUsd,
+    outputTokenLimit,
+    outputTokenLimitPct,
+    messageLimit,
+    messagePct,
+    limitKind,
    resetInMs,
    resetsAt,
    timeToLimitMs,
    safe,
    tokensPerMinute: Math.round(tokensPerMinute * 100) / 100,
+    outputTokensPerMinute: Math.round(outputTokensPerMinute * 100) / 100,
    costUsdPerMinute: Math.round(costUsdPerMinute * 1e8) / 1e8,
    perModel,
    topSessions,
@ -343,14 +389,33 @@ export function RuntimeUsageSection({
              <StatCard
                label="Time to Limit"
                value={usage.timeToLimitMs === null ? "—" : usage.timeToLimitMs === 0 ? "At limit" : fmtMs(usage.timeToLimitMs)}
-                sub={usage.tokenLimit ? `${usage.tokenPct ?? 0}% of ${fmtTokens(usage.tokenLimit)}` : undefined}
+                sub={(() => {
+                  if (usage.outputTokenLimit) {
+                    return `${usage.outputTokenLimitPct ?? 0}% of ${fmtTokens(usage.outputTokenLimit)} out`;
+                  }
+                  if (usage.tokenLimit) {
+                    return `${usage.tokenPct ?? 0}% of ${fmtTokens(usage.tokenLimit)}`;
+                  }
+                  if (usage.messageLimit) {
+                    return `${usage.messagePct ?? 0}% of ${usage.messageLimit} msgs`;
+                  }
+                  return undefined;
+                })()}
                tone={safenessTone}
                icon={<TrendingDown className="h-3 w-3" />}
              />
              <StatCard
                label="Burn Rate"
                value={usage.costUsdPerMinute > 0 ? `${fmtCost(usage.costUsdPerMinute)}/m` : "—"}
-                sub={usage.tokensPerMinute > 0 ? `${fmtTokens(usage.tokensPerMinute)} tok/m` : undefined}
+                sub={(() => {
+                  if (usage.outputTokensPerMinute > 0 && usage.outputTokenLimit) {
+                    return `${fmtTokens(usage.outputTokensPerMinute)} out-tok/m`;
+                  }
+                  if (usage.tokensPerMinute > 0) {
+                    return `${fmtTokens(usage.tokensPerMinute)} tok/m`;
+                  }
+                  return undefined;
+                })()}
                icon={<Flame className="h-3 w-3" />}
              />
            </div>
@ -374,8 +439,10 @@ export function RuntimeUsageSection({
                    <th className="px-3 py-2 text-right font-semibold text-muted">Tokens out</th>
                    <th className="px-3 py-2 text-right font-semibold text-muted">Cost</th>
                    <th className="px-3 py-2 text-right font-semibold text-muted">Calls</th>
-                    {usage.tokenLimit !== null && (
-                      <th className="px-3 py-2 text-right font-semibold text-muted">% limit</th>
+                    {(usage.outputTokenLimit !== null || usage.tokenLimit !== null) && (
+                      <th className="px-3 py-2 text-right font-semibold text-muted">
+                        {usage.outputTokenLimit !== null ? "% out limit" : "% limit"}
+                      </th>
                    )}
                  </tr>
                </thead>
@ -396,7 +463,12 @@ export function RuntimeUsageSection({
                        {entry.unpriced ? <span className="text-[color:var(--warning)]">—*</span> : fmtCost(entry.cost_usd)}
                      </td>
                      <td className="px-3 py-2 text-right tabular-nums text-muted">{entry.calls}</td>
-                      {usage.tokenLimit !== null && (
+                      {usage.outputTokenLimit !== null && (
+                        <td className="px-3 py-2 text-right tabular-nums text-muted">
+                          {Math.round(entry.output_tokens * 100 / usage.outputTokenLimit)}%
+                        </td>
+                      )}
+                      {usage.outputTokenLimit === null && usage.tokenLimit !== null && (
                        <td className="px-3 py-2 text-right tabular-nums text-muted">
                          {Math.round(entry.total_tokens * 100 / usage.tokenLimit)}%
                        </td>