From 5217a70c9f595b83cc68d7743f4fe8ba36d06827 Mon Sep 17 00:00:00 2001 From: null Date: Thu, 21 May 2026 01:43:28 -0500 Subject: [PATCH] =?UTF-8?q?feat(usage):=20separate=20limit=20types=20?= =?UTF-8?q?=E2=80=94=20typed=20limits=20for=20output=20tokens,=20total=20t?= =?UTF-8?q?okens,=20messages=20(#39)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add typed limit fields to RuntimeUsageCurrent: output_token_limit, total_token_limit, message_limit with matching pct and source - Add total_output_tokens and output_tokens_per_minute to burn rate - _build_current() now computes each pct from matching units only - Legacy token_limit backfilled from typed limits for backwards compat - Frontend aggregateRuntimeUsage() tracks typed limits separately - limit_kind field on predictions indicates which limit drove time-to-limit --- backend/app/schemas/runtime_usage.py | 34 ++- .../app/services/openclaw/runtime_usage.py | 218 +++++++++++++----- .../dashboard/RuntimeUsageSection.tsx | 116 ++++++++-- 3 files changed, 289 insertions(+), 79 deletions(-) diff --git a/backend/app/schemas/runtime_usage.py b/backend/app/schemas/runtime_usage.py index 7026da4..3c2c4c6 100644 --- a/backend/app/schemas/runtime_usage.py +++ b/backend/app/schemas/runtime_usage.py @@ -25,21 +25,44 @@ class RuntimeUsageCurrent(SQLModel): """Aggregated totals within the current window.""" total_cost_usd: float - total_tokens: int # input + output across all sessions + total_tokens: int # input + output across all sessions + total_output_tokens: int = 0 # output tokens only — used with output_token_limit total_calls: int - token_limit: int | None = None # configured limit; None = unknown - token_pct: int | None = None # 0–100; None when limit unknown + + # ── Legacy fields (kept for backwards compat) ──────────────────────────── + # token_limit is ambiguous (could be total or output); use typed fields below + # when the limit kind is known. + token_limit: int | None = None + token_pct: int | None = None cost_limit_usd: float | None = None cost_pct: int | None = None - # Source and confidence for the limits token_limit_source: str | None = None cost_limit_source: str | None = None + # ── Typed limits (Phase 4) ──────────────────────────────────────────────── + # Each field pairs a limit with a percent computed from matching units only. + + # Output-token limit: compared against output tokens only, never input/cache. + output_token_limit: int | None = None + output_token_limit_pct: int | None = None + output_token_limit_source: str | None = None + + # Total-token limit: compared against input + output combined. + total_token_limit: int | None = None + total_token_limit_pct: int | None = None + total_token_limit_source: str | None = None + + # Message/request limit: compared against call count, never token totals. + message_limit: int | None = None + message_pct: int | None = None + message_limit_source: str | None = None + class RuntimeUsageBurnRate(SQLModel): """Recent token and cost velocity (last 60 minutes of the window).""" - tokens_per_minute: float + tokens_per_minute: float # input + output combined + output_tokens_per_minute: float = 0.0 # output tokens only cost_usd_per_minute: float @@ -48,6 +71,7 @@ class RuntimeUsagePredictions(SQLModel): time_to_limit_ms: int | None = None # None when limit or burn rate unknown safe: bool # True if time_to_limit > reset_in_ms (will reset before hitting limit) + limit_kind: str = "total_tokens" # which limit drove this prediction class ModelUsageEntry(SQLModel): diff --git a/backend/app/services/openclaw/runtime_usage.py b/backend/app/services/openclaw/runtime_usage.py index 851a4a2..08b30f7 100644 --- a/backend/app/services/openclaw/runtime_usage.py +++ b/backend/app/services/openclaw/runtime_usage.py @@ -544,58 +544,107 @@ def _build_window( ) +def _limit_source(status_raw: dict[str, Any]) -> str: + """Return the appropriate source label for a limit read from gateway status.""" + has_rate_limit_headers = ( + status_raw.get("x_ratelimit_remaining") or + status_raw.get("x_ratelimit_limit") or + status_raw.get("anthropic_ratelimit_remaining") or + status_raw.get("anthropic_ratelimit_limit") + ) + return "provider_api_rate_limit" if has_rate_limit_headers else "configured_limit" + + +def _pct(numerator: int | float, denominator: int | float) -> int | None: + if not denominator: + return None + return int(min(100, numerator * 100 // denominator)) + + def _build_current( per_model: dict[str, ModelUsageEntry], status_raw: dict[str, Any], account_key: str = "default", ) -> RuntimeUsageCurrent: - total_cost = round(sum(e.cost_usd for e in per_model.values()), 8) - total_tokens = sum(e.total_tokens for e in per_model.values()) - total_calls = sum(e.calls for e in per_model.values()) + total_cost = round(sum(e.cost_usd for e in per_model.values()), 8) + total_tokens = sum(e.total_tokens for e in per_model.values()) + total_output_tokens = sum(e.output_tokens for e in per_model.values()) + total_calls = sum(e.calls for e in per_model.values()) - # Try to get configured limits from the gateway status - raw_token_limit = _get_int(status_raw, "tokenLimit", "token_limit", "messageLimit", "message_limit", default=0) - token_limit = raw_token_limit or None - - # Determine source for token limit - if raw_token_limit: - # Check for API rate-limit headers - has_rate_limit_headers = ( - status_raw.get("x_ratelimit_remaining") or - status_raw.get("x_ratelimit_limit") or - status_raw.get("anthropic_ratelimit_remaining") or - status_raw.get("anthropic_ratelimit_limit") - ) - if has_rate_limit_headers: - token_limit_source = "provider_api_rate_limit" - else: - token_limit_source = "configured_limit" - else: - token_limit_source = None - - token_pct = int(min(100, total_tokens * 100 // raw_token_limit)) if raw_token_limit else None + src = _limit_source(status_raw) + # ── Explicit output-token limit ─────────────────────────────────────────── + raw_output_limit = _get_int( + status_raw, "outputTokenLimit", "output_token_limit", default=0 + ) + output_token_limit = raw_output_limit or None + output_token_limit_pct = _pct(total_output_tokens, raw_output_limit) + output_token_limit_src = src if raw_output_limit else None + + # ── Explicit total-token limit ──────────────────────────────────────────── + raw_total_limit = _get_int( + status_raw, "totalTokenLimit", "total_token_limit", default=0 + ) + total_token_limit = raw_total_limit or None + total_token_limit_pct = _pct(total_tokens, raw_total_limit) + total_token_limit_src = src if raw_total_limit else None + + # ── Message/request limit (count-based, never token-based) ─────────────── + raw_message_limit = _get_int( + status_raw, "messageLimit", "message_limit", "requestLimit", "request_limit", + default=0, + ) + message_limit = raw_message_limit or None + message_pct = _pct(total_calls, raw_message_limit) + message_limit_src = src if raw_message_limit else None + + # ── Legacy token_limit (ambiguous kind — maps to tokenLimit only) ───────── + # Do NOT fold messageLimit into this; keep units separate. + raw_token_limit = _get_int(status_raw, "tokenLimit", "token_limit", default=0) + token_limit = raw_token_limit or None + token_pct = _pct(total_tokens, raw_token_limit) + token_limit_src = src if raw_token_limit else None + + # If we got an explicit typed limit but no legacy one, backfill legacy + # so existing dashboard code still works during the transition. + if token_limit is None: + if output_token_limit is not None: + token_limit = output_token_limit + token_pct = output_token_limit_pct + token_limit_src = output_token_limit_src + elif total_token_limit is not None: + token_limit = total_token_limit + token_pct = total_token_limit_pct + token_limit_src = total_token_limit_src + + # ── Cost limit ──────────────────────────────────────────────────────────── raw_cost_limit = _get_float(status_raw, "costLimit", "cost_limit", "costLimitUsd", default=0.0) - cost_limit = raw_cost_limit or None - - # Determine source for cost limit - if raw_cost_limit: - cost_limit_source = "configured_limit" - else: - cost_limit_source = None - - cost_pct = int(min(100, total_cost * 100 / raw_cost_limit)) if raw_cost_limit else None + cost_limit = raw_cost_limit or None + cost_pct = _pct(total_cost, raw_cost_limit) if raw_cost_limit else None + cost_limit_src = src if raw_cost_limit else None return RuntimeUsageCurrent( total_cost_usd=total_cost, total_tokens=total_tokens, + total_output_tokens=total_output_tokens, total_calls=total_calls, + # legacy token_limit=token_limit, token_pct=token_pct, cost_limit_usd=cost_limit, cost_pct=cost_pct, - token_limit_source=token_limit_source, - cost_limit_source=cost_limit_source, + token_limit_source=token_limit_src, + cost_limit_source=cost_limit_src, + # typed + output_token_limit=output_token_limit, + output_token_limit_pct=output_token_limit_pct, + output_token_limit_source=output_token_limit_src, + total_token_limit=total_token_limit, + total_token_limit_pct=total_token_limit_pct, + total_token_limit_source=total_token_limit_src, + message_limit=message_limit, + message_pct=message_pct, + message_limit_source=message_limit_src, ) @@ -604,9 +653,14 @@ def _compute_burn_rate( window: RuntimeUsageWindow, now: datetime, ) -> RuntimeUsageBurnRate: - """Compute tokens/min and cost/min from the most recent 60 minutes of sessions.""" + """Compute tokens/min and cost/min from the most recent 60 minutes of sessions. + + Tracks total tokens (input+output) and output tokens separately so that + predictions against output-token limits use the correct numerator. + """ cutoff = now - timedelta(minutes=60) recent_tokens = 0 + recent_output_tokens = 0 recent_cost = 0.0 for session in sessions: @@ -615,15 +669,14 @@ def _compute_burn_rate( if ts is None or ts < cutoff: continue tokens = _parse_session_usage(session) - recent_tokens += tokens["input"] + tokens["output"] - recent_cost += _get_float(session, "cost", "cost_usd", "costUsd", default=0.0) + recent_tokens += tokens["input"] + tokens["output"] + recent_output_tokens += tokens["output"] + recent_cost += _get_float(session, "cost", "cost_usd", "costUsd", default=0.0) - # Rate per minute over the last 60 minutes - tokens_per_minute = round(recent_tokens / 60, 4) - cost_per_minute = round(recent_cost / 60, 8) return RuntimeUsageBurnRate( - tokens_per_minute=tokens_per_minute, - cost_usd_per_minute=cost_per_minute, + tokens_per_minute=round(recent_tokens / 60, 4), + output_tokens_per_minute=round(recent_output_tokens / 60, 4), + cost_usd_per_minute=round(recent_cost / 60, 8), ) @@ -632,18 +685,79 @@ def _build_predictions( burn_rate: RuntimeUsageBurnRate, window: RuntimeUsageWindow, ) -> RuntimeUsagePredictions: - """Estimate time-to-limit in ms based on total-token burn rate.""" - if burn_rate.tokens_per_minute <= 0 or current.token_limit is None: - return RuntimeUsagePredictions(time_to_limit_ms=None, safe=True) + """Estimate time-to-limit in ms using the most constrained matching limit. - tokens_remaining = current.token_limit - current.total_tokens - if tokens_remaining <= 0: - return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False) + Priority order (tightest first): + 1. output_token_limit vs output_tokens (burn: output_tokens_per_minute) + 2. total_token_limit vs total_tokens (burn: tokens_per_minute) + 3. legacy token_limit vs total_tokens (burn: tokens_per_minute) + 4. message_limit vs total_calls (constant rate = calls / window_minutes) - minutes_to_limit = tokens_remaining / burn_rate.tokens_per_minute - time_to_limit_ms = int(minutes_to_limit * 60 * 1000) + Cost and request limits are not used for time-to-limit since they either + require billing data (cost) or are not the binding constraint in practice. + """ + candidates: list[tuple[int, str]] = [] # (time_to_limit_ms, kind) + + # ── Output-token limit ──────────────────────────────────────────────────── + if ( + current.output_token_limit is not None + and burn_rate.output_tokens_per_minute > 0 + ): + remaining = current.output_token_limit - current.total_output_tokens + if remaining <= 0: + return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False, limit_kind="output_tokens") + candidates.append(( + int(remaining / burn_rate.output_tokens_per_minute * 60_000), + "output_tokens", + )) + + # ── Total-token limit ───────────────────────────────────────────────────── + if ( + current.total_token_limit is not None + and burn_rate.tokens_per_minute > 0 + ): + remaining = current.total_token_limit - current.total_tokens + if remaining <= 0: + return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False, limit_kind="total_tokens") + candidates.append(( + int(remaining / burn_rate.tokens_per_minute * 60_000), + "total_tokens", + )) + + # ── Legacy token_limit (only when no typed token limit) ─────────────────── + if ( + not candidates + and current.token_limit is not None + and burn_rate.tokens_per_minute > 0 + ): + remaining = current.token_limit - current.total_tokens + if remaining <= 0: + return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False, limit_kind="total_tokens") + candidates.append(( + int(remaining / burn_rate.tokens_per_minute * 60_000), + "total_tokens", + )) + + # ── Message limit ───────────────────────────────────────────────────────── + if current.message_limit is not None and current.message_limit > 0: + window_minutes = max(window.reset_in_ms / 60_000, 1) + calls_per_minute = current.total_calls / window_minutes if window_minutes > 0 else 0 + if calls_per_minute > 0: + remaining = current.message_limit - current.total_calls + if remaining <= 0: + return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False, limit_kind="messages") + candidates.append(( + int(remaining / calls_per_minute * 60_000), + "messages", + )) + + if not candidates: + return RuntimeUsagePredictions(time_to_limit_ms=None, safe=True, limit_kind="total_tokens") + + # Pick the most constrained (smallest time) — that is what will actually block work. + time_to_limit_ms, kind = min(candidates, key=lambda c: c[0]) safe = time_to_limit_ms > window.reset_in_ms - return RuntimeUsagePredictions(time_to_limit_ms=time_to_limit_ms, safe=safe) + return RuntimeUsagePredictions(time_to_limit_ms=time_to_limit_ms, safe=safe, limit_kind=kind) # --------------------------------------------------------------------------- diff --git a/frontend/src/components/dashboard/RuntimeUsageSection.tsx b/frontend/src/components/dashboard/RuntimeUsageSection.tsx index a39febd..0b29294 100644 --- a/frontend/src/components/dashboard/RuntimeUsageSection.tsx +++ b/frontend/src/components/dashboard/RuntimeUsageSection.tsx @@ -16,15 +16,24 @@ import { DashboardEmptyState } from "./DashboardEmptyState"; export interface AggregatedRuntimeUsage { totalCostUsd: number; totalTokens: number; + totalOutputTokens: number; totalCalls: number; + // Legacy (backwards compat) tokenLimit: number | null; tokenPct: number | null; costLimitUsd: number | null; + // Typed limits (Phase 4) + outputTokenLimit: number | null; + outputTokenLimitPct: number | null; + messageLimit: number | null; + messagePct: number | null; + limitKind: string; // which limit drove the time-to-limit prediction resetInMs: number; resetsAt: string; timeToLimitMs: number | null; safe: boolean; tokensPerMinute: number; + outputTokensPerMinute: number; costUsdPerMinute: number; perModel: Record; topSessions: TopSession[]; @@ -54,15 +63,22 @@ export function aggregateRuntimeUsage( return { totalCostUsd: 0, totalTokens: 0, + totalOutputTokens: 0, totalCalls: 0, tokenLimit: null, tokenPct: null, costLimitUsd: null, + outputTokenLimit: null, + outputTokenLimitPct: null, + messageLimit: null, + messagePct: null, + limitKind: "total_tokens", resetInMs: 0, resetsAt: "", timeToLimitMs: null, safe: true, tokensPerMinute: 0, + outputTokensPerMinute: 0, costUsdPerMinute: 0, perModel: {}, topSessions: [], @@ -71,21 +87,24 @@ export function aggregateRuntimeUsage( let totalCostUsd = 0; let totalTokens = 0; + let totalOutputTokens = 0; let totalCalls = 0; let tokensPerMinute = 0; + let outputTokensPerMinute = 0; let costUsdPerMinute = 0; - // Use the window that resets soonest as the binding constraint let resetInMs = valid[0].window.reset_in_ms; let resetsAt = valid[0].window.resets_at; const perModel: Record = {}; const allSessions: TopSession[] = []; for (const r of valid) { - totalCostUsd += r.current.total_cost_usd; - totalTokens += r.current.total_tokens; - totalCalls += r.current.total_calls; - tokensPerMinute += r.burn_rate.tokens_per_minute; - costUsdPerMinute += r.burn_rate.cost_usd_per_minute; + totalCostUsd += r.current.total_cost_usd; + totalTokens += r.current.total_tokens; + totalOutputTokens += r.current.total_output_tokens ?? 0; + totalCalls += r.current.total_calls; + tokensPerMinute += r.burn_rate.tokens_per_minute; + outputTokensPerMinute += r.burn_rate.output_tokens_per_minute ?? 0; + costUsdPerMinute += r.burn_rate.cost_usd_per_minute; if (r.window.reset_in_ms < resetInMs) { resetInMs = r.window.reset_in_ms; @@ -113,25 +132,45 @@ export function aggregateRuntimeUsage( allSessions.push(...r.top_sessions); } - // Aggregate token limits — only meaningful if all gateways share one limit - const limits = valid.map(r => r.current.token_limit).filter((v): v is number => v !== null); + // Legacy token limit + const limits = valid.map(r => r.current.token_limit).filter((v): v is number => v !== null); const tokenLimit = limits.length > 0 ? limits.reduce((a, b) => a + b, 0) : null; const tokenPct = tokenLimit ? Math.min(100, Math.round(totalTokens * 100 / tokenLimit)) : null; const costLimits = valid.map(r => r.current.cost_limit_usd).filter((v): v is number => v !== null); const costLimitUsd = costLimits.length > 0 ? costLimits.reduce((a, b) => a + b, 0) : null; - // Re-derive time-to-limit from aggregated burn rate + // Typed limits + const outLimits = valid.map(r => r.current.output_token_limit).filter((v): v is number => v !== null); + const outputTokenLimit = outLimits.length > 0 ? outLimits.reduce((a, b) => a + b, 0) : null; + const outputTokenLimitPct = outputTokenLimit + ? Math.min(100, Math.round(totalOutputTokens * 100 / outputTokenLimit)) + : null; + + const msgLimits = valid.map(r => r.current.message_limit).filter((v): v is number => v !== null); + const messageLimit = msgLimits.length > 0 ? msgLimits.reduce((a, b) => a + b, 0) : null; + const messagePct = messageLimit ? Math.min(100, Math.round(totalCalls * 100 / messageLimit)) : null; + + // Re-derive time-to-limit using the most constrained matching limit let timeToLimitMs: number | null = null; let safe = true; + let limitKind = "total_tokens"; + const candidates: Array<{ ms: number; kind: string }> = []; + + if (outputTokenLimit !== null && outputTokensPerMinute > 0) { + const rem = outputTokenLimit - totalOutputTokens; + if (rem <= 0) return { totalCostUsd: Math.round(totalCostUsd * 1e8) / 1e8, totalTokens, totalOutputTokens, totalCalls, tokenLimit, tokenPct, costLimitUsd, outputTokenLimit, outputTokenLimitPct, messageLimit, messagePct, limitKind: "output_tokens", resetInMs, resetsAt, timeToLimitMs: 0, safe: false, tokensPerMinute: Math.round(tokensPerMinute * 100) / 100, outputTokensPerMinute: Math.round(outputTokensPerMinute * 100) / 100, costUsdPerMinute: Math.round(costUsdPerMinute * 1e8) / 1e8, perModel, topSessions: [...allSessions].sort((a, b) => b.cost_usd - a.cost_usd).slice(0, 10) }; + candidates.push({ ms: Math.round(rem / outputTokensPerMinute * 60_000), kind: "output_tokens" }); + } if (tokenLimit !== null && tokensPerMinute > 0) { - const remaining = tokenLimit - totalTokens; - if (remaining <= 0) { - timeToLimitMs = 0; - safe = false; - } else { - timeToLimitMs = Math.round((remaining / tokensPerMinute) * 60 * 1000); - safe = timeToLimitMs > resetInMs; - } + const rem = tokenLimit - totalTokens; + if (rem <= 0) return { totalCostUsd: Math.round(totalCostUsd * 1e8) / 1e8, totalTokens, totalOutputTokens, totalCalls, tokenLimit, tokenPct, costLimitUsd, outputTokenLimit, outputTokenLimitPct, messageLimit, messagePct, limitKind: "total_tokens", resetInMs, resetsAt, timeToLimitMs: 0, safe: false, tokensPerMinute: Math.round(tokensPerMinute * 100) / 100, outputTokensPerMinute: Math.round(outputTokensPerMinute * 100) / 100, costUsdPerMinute: Math.round(costUsdPerMinute * 1e8) / 1e8, perModel, topSessions: [...allSessions].sort((a, b) => b.cost_usd - a.cost_usd).slice(0, 10) }; + candidates.push({ ms: Math.round(rem / tokensPerMinute * 60_000), kind: "total_tokens" }); + } + if (candidates.length > 0) { + const tightest = candidates.reduce((a, b) => a.ms < b.ms ? a : b); + timeToLimitMs = tightest.ms; + limitKind = tightest.kind; + safe = timeToLimitMs > resetInMs; } const topSessions = [...allSessions] @@ -141,15 +180,22 @@ export function aggregateRuntimeUsage( return { totalCostUsd: Math.round(totalCostUsd * 1e8) / 1e8, totalTokens, + totalOutputTokens, totalCalls, tokenLimit, tokenPct, costLimitUsd, + outputTokenLimit, + outputTokenLimitPct, + messageLimit, + messagePct, + limitKind, resetInMs, resetsAt, timeToLimitMs, safe, tokensPerMinute: Math.round(tokensPerMinute * 100) / 100, + outputTokensPerMinute: Math.round(outputTokensPerMinute * 100) / 100, costUsdPerMinute: Math.round(costUsdPerMinute * 1e8) / 1e8, perModel, topSessions, @@ -343,14 +389,33 @@ export function RuntimeUsageSection({ { + if (usage.outputTokenLimit) { + return `${usage.outputTokenLimitPct ?? 0}% of ${fmtTokens(usage.outputTokenLimit)} out`; + } + if (usage.tokenLimit) { + return `${usage.tokenPct ?? 0}% of ${fmtTokens(usage.tokenLimit)}`; + } + if (usage.messageLimit) { + return `${usage.messagePct ?? 0}% of ${usage.messageLimit} msgs`; + } + return undefined; + })()} tone={safenessTone} icon={} /> 0 ? `${fmtCost(usage.costUsdPerMinute)}/m` : "—"} - sub={usage.tokensPerMinute > 0 ? `${fmtTokens(usage.tokensPerMinute)} tok/m` : undefined} + sub={(() => { + if (usage.outputTokensPerMinute > 0 && usage.outputTokenLimit) { + return `${fmtTokens(usage.outputTokensPerMinute)} out-tok/m`; + } + if (usage.tokensPerMinute > 0) { + return `${fmtTokens(usage.tokensPerMinute)} tok/m`; + } + return undefined; + })()} icon={} /> @@ -374,8 +439,10 @@ export function RuntimeUsageSection({ Tokens out Cost Calls - {usage.tokenLimit !== null && ( - % limit + {(usage.outputTokenLimit !== null || usage.tokenLimit !== null) && ( + + {usage.outputTokenLimit !== null ? "% out limit" : "% limit"} + )} @@ -396,7 +463,12 @@ export function RuntimeUsageSection({ {entry.unpriced ? —* : fmtCost(entry.cost_usd)} {entry.calls} - {usage.tokenLimit !== null && ( + {usage.outputTokenLimit !== null && ( + + {Math.round(entry.output_tokens * 100 / usage.outputTokenLimit)}% + + )} + {usage.outputTokenLimit === null && usage.tokenLimit !== null && ( {Math.round(entry.total_tokens * 100 / usage.tokenLimit)}%