feat(usage): separate limit types — typed limits for output tokens, total tokens, messages (#39)

- Add typed limit fields to RuntimeUsageCurrent: output_token_limit, total_token_limit, and message_limit, each with a matching pct and source
- Add total_output_tokens to RuntimeUsageCurrent and output_tokens_per_minute to the burn rate
- _build_current() now computes each pct from matching units only
- Legacy token_limit is backfilled from the typed limits for backwards compatibility
- Frontend aggregateRuntimeUsage() tracks typed limits separately
- The limit_kind field on predictions indicates which limit drove time-to-limit
2026-05-21 01:43:28 -05:00 · 2026-05-21 01:43:28 -05:00 · 5217a70c9f
parent 02eb03d408
commit 5217a70c9f
3 changed files with 289 additions and 79 deletions
--- a/backend/app/schemas/runtime_usage.py
+++ b/backend/app/schemas/runtime_usage.py
@ -25,21 +25,44 @@ class RuntimeUsageCurrent(SQLModel):
    """Aggregated totals within the current window."""
    total_cost_usd: float
-    total_tokens: int  # input + output across all sessions
+    total_tokens: int       # input + output across all sessions
    total_output_tokens: int = 0  # output tokens only — used with output_token_limit
    total_calls: int
-    token_limit: int | None = None  # configured limit; None = unknown
+
-    token_pct: int | None = None   # 0–100; None when limit unknown
+    # ── Legacy fields (kept for backwards compat) ────────────────────────────
    # token_limit is ambiguous (could be total or output); use typed fields below
    # when the limit kind is known.
    token_limit: int | None = None
    token_pct: int | None = None
    cost_limit_usd: float | None = None
    cost_pct: int | None = None
    # Source and confidence for the limits
    token_limit_source: str | None = None
    cost_limit_source: str | None = None
    # ── Typed limits (Phase 4) ────────────────────────────────────────────────
    # Each field pairs a limit with a percent computed from matching units only.
    # Output-token limit: compared against output tokens only, never input/cache.
    output_token_limit: int | None = None
    output_token_limit_pct: int | None = None
    output_token_limit_source: str | None = None
    # Total-token limit: compared against input + output combined.
    total_token_limit: int | None = None
    total_token_limit_pct: int | None = None
    total_token_limit_source: str | None = None
    # Message/request limit: compared against call count, never token totals.
    message_limit: int | None = None
    message_pct: int | None = None
    message_limit_source: str | None = None
 class RuntimeUsageBurnRate(SQLModel):
    """Recent token and cost velocity (last 60 minutes of the window)."""
-    tokens_per_minute: float
+    tokens_per_minute: float          # input + output combined
    output_tokens_per_minute: float = 0.0  # output tokens only
    cost_usd_per_minute: float
@ -48,6 +71,7 @@ class RuntimeUsagePredictions(SQLModel):
    time_to_limit_ms: int | None = None  # None when limit or burn rate unknown
    safe: bool  # True if time_to_limit > reset_in_ms (will reset before hitting limit)
    limit_kind: str = "total_tokens"  # which limit drove this prediction
 class ModelUsageEntry(SQLModel):
--- a/backend/app/services/openclaw/runtime_usage.py
+++ b/backend/app/services/openclaw/runtime_usage.py
@ -544,58 +544,107 @@ def _build_window(
    )
 def _limit_source(status_raw: dict[str, Any]) -> str:
    """Return the appropriate source label for a limit read from gateway status."""
    has_rate_limit_headers = (
        status_raw.get("x_ratelimit_remaining") or
        status_raw.get("x_ratelimit_limit") or
        status_raw.get("anthropic_ratelimit_remaining") or
        status_raw.get("anthropic_ratelimit_limit")
    )
    return "provider_api_rate_limit" if has_rate_limit_headers else "configured_limit"
 def _pct(numerator: int | float, denominator: int | float) -> int | None:
    if not denominator:
        return None
    return int(min(100, numerator * 100 // denominator))
 def _build_current(
    per_model: dict[str, ModelUsageEntry],
    status_raw: dict[str, Any],
    account_key: str = "default",
 ) -> RuntimeUsageCurrent:
-    total_cost  = round(sum(e.cost_usd for e in per_model.values()), 8)
+    total_cost         = round(sum(e.cost_usd for e in per_model.values()), 8)
-    total_tokens = sum(e.total_tokens for e in per_model.values())
+    total_tokens       = sum(e.total_tokens for e in per_model.values())
-    total_calls  = sum(e.calls for e in per_model.values())
+    total_output_tokens = sum(e.output_tokens for e in per_model.values())
    total_calls        = sum(e.calls for e in per_model.values())
-    # Try to get configured limits from the gateway status
+    src = _limit_source(status_raw)
    raw_token_limit = _get_int(status_raw, "tokenLimit", "token_limit", "messageLimit", "message_limit", default=0)
    token_limit = raw_token_limit or None
    # Determine source for token limit
    if raw_token_limit:
        # Check for API rate-limit headers
        has_rate_limit_headers = (
            status_raw.get("x_ratelimit_remaining") or
            status_raw.get("x_ratelimit_limit") or
            status_raw.get("anthropic_ratelimit_remaining") or
            status_raw.get("anthropic_ratelimit_limit")
        )
        if has_rate_limit_headers:
            token_limit_source = "provider_api_rate_limit"
        else:
            token_limit_source = "configured_limit"
    else:
        token_limit_source = None
    token_pct   = int(min(100, total_tokens * 100 // raw_token_limit)) if raw_token_limit else None
    # ── Explicit output-token limit ───────────────────────────────────────────
    raw_output_limit = _get_int(
        status_raw, "outputTokenLimit", "output_token_limit", default=0
    )
    output_token_limit     = raw_output_limit or None
    output_token_limit_pct = _pct(total_output_tokens, raw_output_limit)
    output_token_limit_src = src if raw_output_limit else None
    # ── Explicit total-token limit ────────────────────────────────────────────
    raw_total_limit = _get_int(
        status_raw, "totalTokenLimit", "total_token_limit", default=0
    )
    total_token_limit     = raw_total_limit or None
    total_token_limit_pct = _pct(total_tokens, raw_total_limit)
    total_token_limit_src = src if raw_total_limit else None
    # ── Message/request limit (count-based, never token-based) ───────────────
    raw_message_limit = _get_int(
        status_raw, "messageLimit", "message_limit", "requestLimit", "request_limit",
        default=0,
    )
    message_limit     = raw_message_limit or None
    message_pct       = _pct(total_calls, raw_message_limit)
    message_limit_src = src if raw_message_limit else None
    # ── Legacy token_limit (ambiguous kind — maps to tokenLimit only) ─────────
    # Do NOT fold messageLimit into this; keep units separate.
    raw_token_limit = _get_int(status_raw, "tokenLimit", "token_limit", default=0)
    token_limit     = raw_token_limit or None
    token_pct       = _pct(total_tokens, raw_token_limit)
    token_limit_src = src if raw_token_limit else None
    # If we got an explicit typed limit but no legacy one, backfill legacy
    # so existing dashboard code still works during the transition.
    if token_limit is None:
        if output_token_limit is not None:
            token_limit     = output_token_limit
            token_pct       = output_token_limit_pct
            token_limit_src = output_token_limit_src
        elif total_token_limit is not None:
            token_limit     = total_token_limit
            token_pct       = total_token_limit_pct
            token_limit_src = total_token_limit_src
    # ── Cost limit ────────────────────────────────────────────────────────────
    raw_cost_limit = _get_float(status_raw, "costLimit", "cost_limit", "costLimitUsd", default=0.0)
-    cost_limit  = raw_cost_limit or None
+    cost_limit     = raw_cost_limit or None
-    
+    cost_pct       = _pct(total_cost, raw_cost_limit) if raw_cost_limit else None
-    # Determine source for cost limit
+    cost_limit_src = src if raw_cost_limit else None
    if raw_cost_limit:
        cost_limit_source = "configured_limit"
    else:
        cost_limit_source = None
    cost_pct    = int(min(100, total_cost * 100 / raw_cost_limit)) if raw_cost_limit else None
    return RuntimeUsageCurrent(
        total_cost_usd=total_cost,
        total_tokens=total_tokens,
        total_output_tokens=total_output_tokens,
        total_calls=total_calls,
        # legacy
        token_limit=token_limit,
        token_pct=token_pct,
        cost_limit_usd=cost_limit,
        cost_pct=cost_pct,
-        token_limit_source=token_limit_source,
+        token_limit_source=token_limit_src,
-        cost_limit_source=cost_limit_source,
+        cost_limit_source=cost_limit_src,
        # typed
        output_token_limit=output_token_limit,
        output_token_limit_pct=output_token_limit_pct,
        output_token_limit_source=output_token_limit_src,
        total_token_limit=total_token_limit,
        total_token_limit_pct=total_token_limit_pct,
        total_token_limit_source=total_token_limit_src,
        message_limit=message_limit,
        message_pct=message_pct,
        message_limit_source=message_limit_src,
    )
@ -604,9 +653,14 @@ def _compute_burn_rate(
    window: RuntimeUsageWindow,
    now: datetime,
 ) -> RuntimeUsageBurnRate:
-    """Compute tokens/min and cost/min from the most recent 60 minutes of sessions."""
+    """Compute tokens/min and cost/min from the most recent 60 minutes of sessions.
    Tracks total tokens (input+output) and output tokens separately so that
    predictions against output-token limits use the correct numerator.
    """
    cutoff = now - timedelta(minutes=60)
    recent_tokens = 0
    recent_output_tokens = 0
    recent_cost   = 0.0
    for session in sessions:
@ -615,15 +669,14 @@ def _compute_burn_rate(
        if ts is None or ts < cutoff:
            continue
        tokens = _parse_session_usage(session)
-        recent_tokens += tokens["input"] + tokens["output"]
+        recent_tokens        += tokens["input"] + tokens["output"]
-        recent_cost   += _get_float(session, "cost", "cost_usd", "costUsd", default=0.0)
+        recent_output_tokens += tokens["output"]
        recent_cost          += _get_float(session, "cost", "cost_usd", "costUsd", default=0.0)
    # Rate per minute over the last 60 minutes
    tokens_per_minute  = round(recent_tokens / 60, 4)
    cost_per_minute    = round(recent_cost   / 60, 8)
    return RuntimeUsageBurnRate(
-        tokens_per_minute=tokens_per_minute,
+        tokens_per_minute=round(recent_tokens / 60, 4),
-        cost_usd_per_minute=cost_per_minute,
+        output_tokens_per_minute=round(recent_output_tokens / 60, 4),
        cost_usd_per_minute=round(recent_cost / 60, 8),
    )
@ -632,18 +685,79 @@ def _build_predictions(
    burn_rate: RuntimeUsageBurnRate,
    window: RuntimeUsageWindow,
 ) -> RuntimeUsagePredictions:
-    """Estimate time-to-limit in ms based on total-token burn rate."""
+    """Estimate time-to-limit in ms using the most constrained matching limit.
    if burn_rate.tokens_per_minute <= 0 or current.token_limit is None:
        return RuntimeUsagePredictions(time_to_limit_ms=None, safe=True)
-    tokens_remaining = current.token_limit - current.total_tokens
+    Priority order (tightest first):
-    if tokens_remaining <= 0:
+      1. output_token_limit  vs output_tokens   (burn: output_tokens_per_minute)
-        return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False)
+      2. total_token_limit   vs total_tokens    (burn: tokens_per_minute)
      3. legacy token_limit  vs total_tokens    (burn: tokens_per_minute)
      4. message_limit       vs total_calls     (constant rate = calls / window_minutes)
-    minutes_to_limit  = tokens_remaining / burn_rate.tokens_per_minute
+    Cost and request limits are not used for time-to-limit since they either
-    time_to_limit_ms  = int(minutes_to_limit * 60 * 1000)
+    require billing data (cost) or are not the binding constraint in practice.
    """
    candidates: list[tuple[int, str]] = []  # (time_to_limit_ms, kind)
    # ── Output-token limit ────────────────────────────────────────────────────
    if (
        current.output_token_limit is not None
        and burn_rate.output_tokens_per_minute > 0
    ):
        remaining = current.output_token_limit - current.total_output_tokens
        if remaining <= 0:
            return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False, limit_kind="output_tokens")
        candidates.append((
            int(remaining / burn_rate.output_tokens_per_minute * 60_000),
            "output_tokens",
        ))
    # ── Total-token limit ─────────────────────────────────────────────────────
    if (
        current.total_token_limit is not None
        and burn_rate.tokens_per_minute > 0
    ):
        remaining = current.total_token_limit - current.total_tokens
        if remaining <= 0:
            return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False, limit_kind="total_tokens")
        candidates.append((
            int(remaining / burn_rate.tokens_per_minute * 60_000),
            "total_tokens",
        ))
    # ── Legacy token_limit (only when no typed token limit) ───────────────────
    if (
        not candidates
        and current.token_limit is not None
        and burn_rate.tokens_per_minute > 0
    ):
        remaining = current.token_limit - current.total_tokens
        if remaining <= 0:
            return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False, limit_kind="total_tokens")
        candidates.append((
            int(remaining / burn_rate.tokens_per_minute * 60_000),
            "total_tokens",
        ))
    # ── Message limit ─────────────────────────────────────────────────────────
    if current.message_limit is not None and current.message_limit > 0:
        window_minutes = max(window.reset_in_ms / 60_000, 1)
        calls_per_minute = current.total_calls / window_minutes if window_minutes > 0 else 0
        if calls_per_minute > 0:
            remaining = current.message_limit - current.total_calls
            if remaining <= 0:
                return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False, limit_kind="messages")
            candidates.append((
                int(remaining / calls_per_minute * 60_000),
                "messages",
            ))
    if not candidates:
        return RuntimeUsagePredictions(time_to_limit_ms=None, safe=True, limit_kind="total_tokens")
    # Pick the most constrained (smallest time) — that is what will actually block work.
    time_to_limit_ms, kind = min(candidates, key=lambda c: c[0])
    safe = time_to_limit_ms > window.reset_in_ms
-    return RuntimeUsagePredictions(time_to_limit_ms=time_to_limit_ms, safe=safe)
+    return RuntimeUsagePredictions(time_to_limit_ms=time_to_limit_ms, safe=safe, limit_kind=kind)
 # ---------------------------------------------------------------------------
--- a/frontend/src/components/dashboard/RuntimeUsageSection.tsx
+++ b/frontend/src/components/dashboard/RuntimeUsageSection.tsx
@ -16,15 +16,24 @@ import { DashboardEmptyState } from "./DashboardEmptyState";
 export interface AggregatedRuntimeUsage {
  totalCostUsd: number;
  totalTokens: number;
  totalOutputTokens: number;
  totalCalls: number;
  // Legacy (backwards compat)
  tokenLimit: number | null;
  tokenPct: number | null;
  costLimitUsd: number | null;
  // Typed limits (Phase 4)
  outputTokenLimit: number | null;
  outputTokenLimitPct: number | null;
  messageLimit: number | null;
  messagePct: number | null;
  limitKind: string;  // which limit drove the time-to-limit prediction
  resetInMs: number;
  resetsAt: string;
  timeToLimitMs: number | null;
  safe: boolean;
  tokensPerMinute: number;
  outputTokensPerMinute: number;
  costUsdPerMinute: number;
  perModel: Record<string, ModelUsageEntry>;
  topSessions: TopSession[];
@ -54,15 +63,22 @@ export function aggregateRuntimeUsage(
    return {
      totalCostUsd: 0,
      totalTokens: 0,
      totalOutputTokens: 0,
      totalCalls: 0,
      tokenLimit: null,
      tokenPct: null,
      costLimitUsd: null,
      outputTokenLimit: null,
      outputTokenLimitPct: null,
      messageLimit: null,
      messagePct: null,
      limitKind: "total_tokens",
      resetInMs: 0,
      resetsAt: "",
      timeToLimitMs: null,
      safe: true,
      tokensPerMinute: 0,
      outputTokensPerMinute: 0,
      costUsdPerMinute: 0,
      perModel: {},
      topSessions: [],
@ -71,21 +87,24 @@ export function aggregateRuntimeUsage(
  let totalCostUsd = 0;
  let totalTokens = 0;
  let totalOutputTokens = 0;
  let totalCalls = 0;
  let tokensPerMinute = 0;
  let outputTokensPerMinute = 0;
  let costUsdPerMinute = 0;
  // Use the window that resets soonest as the binding constraint
  let resetInMs = valid[0].window.reset_in_ms;
  let resetsAt = valid[0].window.resets_at;
  const perModel: Record<string, ModelUsageEntry> = {};
  const allSessions: TopSession[] = [];
  for (const r of valid) {
-    totalCostUsd     += r.current.total_cost_usd;
+    totalCostUsd          += r.current.total_cost_usd;
-    totalTokens      += r.current.total_tokens;
+    totalTokens           += r.current.total_tokens;
-    totalCalls       += r.current.total_calls;
+    totalOutputTokens     += r.current.total_output_tokens ?? 0;
-    tokensPerMinute  += r.burn_rate.tokens_per_minute;
+    totalCalls            += r.current.total_calls;
-    costUsdPerMinute += r.burn_rate.cost_usd_per_minute;
+    tokensPerMinute       += r.burn_rate.tokens_per_minute;
    outputTokensPerMinute += r.burn_rate.output_tokens_per_minute ?? 0;
    costUsdPerMinute      += r.burn_rate.cost_usd_per_minute;
    if (r.window.reset_in_ms < resetInMs) {
      resetInMs = r.window.reset_in_ms;
@ -113,25 +132,45 @@ export function aggregateRuntimeUsage(
    allSessions.push(...r.top_sessions);
  }
-  // Aggregate token limits — only meaningful if all gateways share one limit
+  // Legacy token limit
-  const limits = valid.map(r => r.current.token_limit).filter((v): v is number => v !== null);
+  const limits    = valid.map(r => r.current.token_limit).filter((v): v is number => v !== null);
  const tokenLimit    = limits.length > 0 ? limits.reduce((a, b) => a + b, 0) : null;
  const tokenPct      = tokenLimit ? Math.min(100, Math.round(totalTokens * 100 / tokenLimit)) : null;
  const costLimits    = valid.map(r => r.current.cost_limit_usd).filter((v): v is number => v !== null);
  const costLimitUsd  = costLimits.length > 0 ? costLimits.reduce((a, b) => a + b, 0) : null;
-  // Re-derive time-to-limit from aggregated burn rate
+  // Typed limits
  const outLimits  = valid.map(r => r.current.output_token_limit).filter((v): v is number => v !== null);
  const outputTokenLimit    = outLimits.length > 0 ? outLimits.reduce((a, b) => a + b, 0) : null;
  const outputTokenLimitPct = outputTokenLimit
    ? Math.min(100, Math.round(totalOutputTokens * 100 / outputTokenLimit))
    : null;
  const msgLimits  = valid.map(r => r.current.message_limit).filter((v): v is number => v !== null);
  const messageLimit = msgLimits.length > 0 ? msgLimits.reduce((a, b) => a + b, 0) : null;
  const messagePct   = messageLimit ? Math.min(100, Math.round(totalCalls * 100 / messageLimit)) : null;
  // Re-derive time-to-limit using the most constrained matching limit
  let timeToLimitMs: number | null = null;
  let safe = true;
  let limitKind = "total_tokens";
  const candidates: Array<{ ms: number; kind: string }> = [];
  if (outputTokenLimit !== null && outputTokensPerMinute > 0) {
    const rem = outputTokenLimit - totalOutputTokens;
    if (rem <= 0) return { totalCostUsd: Math.round(totalCostUsd * 1e8) / 1e8, totalTokens, totalOutputTokens, totalCalls, tokenLimit, tokenPct, costLimitUsd, outputTokenLimit, outputTokenLimitPct, messageLimit, messagePct, limitKind: "output_tokens", resetInMs, resetsAt, timeToLimitMs: 0, safe: false, tokensPerMinute: Math.round(tokensPerMinute * 100) / 100, outputTokensPerMinute: Math.round(outputTokensPerMinute * 100) / 100, costUsdPerMinute: Math.round(costUsdPerMinute * 1e8) / 1e8, perModel, topSessions: [...allSessions].sort((a, b) => b.cost_usd - a.cost_usd).slice(0, 10) };
    candidates.push({ ms: Math.round(rem / outputTokensPerMinute * 60_000), kind: "output_tokens" });
  }
  if (tokenLimit !== null && tokensPerMinute > 0) {
-    const remaining = tokenLimit - totalTokens;
+    const rem = tokenLimit - totalTokens;
-    if (remaining <= 0) {
+    if (rem <= 0) return { totalCostUsd: Math.round(totalCostUsd * 1e8) / 1e8, totalTokens, totalOutputTokens, totalCalls, tokenLimit, tokenPct, costLimitUsd, outputTokenLimit, outputTokenLimitPct, messageLimit, messagePct, limitKind: "total_tokens", resetInMs, resetsAt, timeToLimitMs: 0, safe: false, tokensPerMinute: Math.round(tokensPerMinute * 100) / 100, outputTokensPerMinute: Math.round(outputTokensPerMinute * 100) / 100, costUsdPerMinute: Math.round(costUsdPerMinute * 1e8) / 1e8, perModel, topSessions: [...allSessions].sort((a, b) => b.cost_usd - a.cost_usd).slice(0, 10) };
-      timeToLimitMs = 0;
+    candidates.push({ ms: Math.round(rem / tokensPerMinute * 60_000), kind: "total_tokens" });
-      safe = false;
+  }
-    } else {
+  if (candidates.length > 0) {
-      timeToLimitMs = Math.round((remaining / tokensPerMinute) * 60 * 1000);
+    const tightest = candidates.reduce((a, b) => a.ms < b.ms ? a : b);
-      safe = timeToLimitMs > resetInMs;
+    timeToLimitMs = tightest.ms;
-    }
+    limitKind = tightest.kind;
    safe = timeToLimitMs > resetInMs;
  }
  const topSessions = [...allSessions]
@ -141,15 +180,22 @@ export function aggregateRuntimeUsage(
  return {
    totalCostUsd: Math.round(totalCostUsd * 1e8) / 1e8,
    totalTokens,
    totalOutputTokens,
    totalCalls,
    tokenLimit,
    tokenPct,
    costLimitUsd,
    outputTokenLimit,
    outputTokenLimitPct,
    messageLimit,
    messagePct,
    limitKind,
    resetInMs,
    resetsAt,
    timeToLimitMs,
    safe,
    tokensPerMinute: Math.round(tokensPerMinute * 100) / 100,
    outputTokensPerMinute: Math.round(outputTokensPerMinute * 100) / 100,
    costUsdPerMinute: Math.round(costUsdPerMinute * 1e8) / 1e8,
    perModel,
    topSessions,
@ -343,14 +389,33 @@ export function RuntimeUsageSection({
              <StatCard
                label="Time to Limit"
                value={usage.timeToLimitMs === null ? "—" : usage.timeToLimitMs === 0 ? "At limit" : fmtMs(usage.timeToLimitMs)}
-                sub={usage.tokenLimit ? `${usage.tokenPct ?? 0}% of ${fmtTokens(usage.tokenLimit)}` : undefined}
+                sub={(() => {
                  if (usage.outputTokenLimit) {
                    return `${usage.outputTokenLimitPct ?? 0}% of ${fmtTokens(usage.outputTokenLimit)} out`;
                  }
                  if (usage.tokenLimit) {
                    return `${usage.tokenPct ?? 0}% of ${fmtTokens(usage.tokenLimit)}`;
                  }
                  if (usage.messageLimit) {
                    return `${usage.messagePct ?? 0}% of ${usage.messageLimit} msgs`;
                  }
                  return undefined;
                })()}
                tone={safenessTone}
                icon={<TrendingDown className="h-3 w-3" />}
              />
              <StatCard
                label="Burn Rate"
                value={usage.costUsdPerMinute > 0 ? `${fmtCost(usage.costUsdPerMinute)}/m` : "—"}
-                sub={usage.tokensPerMinute > 0 ? `${fmtTokens(usage.tokensPerMinute)} tok/m` : undefined}
+                sub={(() => {
                  if (usage.outputTokensPerMinute > 0 && usage.outputTokenLimit) {
                    return `${fmtTokens(usage.outputTokensPerMinute)} out-tok/m`;
                  }
                  if (usage.tokensPerMinute > 0) {
                    return `${fmtTokens(usage.tokensPerMinute)} tok/m`;
                  }
                  return undefined;
                })()}
                icon={<Flame className="h-3 w-3" />}
              />
            </div>
@ -374,8 +439,10 @@ export function RuntimeUsageSection({
                    <th className="px-3 py-2 text-right font-semibold text-muted">Tokens out</th>
                    <th className="px-3 py-2 text-right font-semibold text-muted">Cost</th>
                    <th className="px-3 py-2 text-right font-semibold text-muted">Calls</th>
-                    {usage.tokenLimit !== null && (
+                    {(usage.outputTokenLimit !== null || usage.tokenLimit !== null) && (
-                      <th className="px-3 py-2 text-right font-semibold text-muted">% limit</th>
+                      <th className="px-3 py-2 text-right font-semibold text-muted">
                        {usage.outputTokenLimit !== null ? "% out limit" : "% limit"}
                      </th>
                    )}
                  </tr>
                </thead>
@ -396,7 +463,12 @@ export function RuntimeUsageSection({
                        {entry.unpriced ? <span className="text-[color:var(--warning)]">—*</span> : fmtCost(entry.cost_usd)}
                      </td>
                      <td className="px-3 py-2 text-right tabular-nums text-muted">{entry.calls}</td>
-                      {usage.tokenLimit !== null && (
+                      {usage.outputTokenLimit !== null && (
                        <td className="px-3 py-2 text-right tabular-nums text-muted">
                          {Math.round(entry.output_tokens * 100 / usage.outputTokenLimit)}%
                        </td>
                      )}
                      {usage.outputTokenLimit === null && usage.tokenLimit !== null && (
                        <td className="px-3 py-2 text-right tabular-nums text-muted">
                          {Math.round(entry.total_tokens * 100 / usage.tokenLimit)}%
                        </td>