feat(usage): separate limit types — typed limits for output tokens, total tokens, messages (#39)

- Add typed limit fields to RuntimeUsageCurrent: output_token_limit,
  total_token_limit, message_limit with matching pct and source
- Add total_output_tokens and output_tokens_per_minute to burn rate
- _build_current() now computes each pct from matching units only
- Legacy token_limit backfilled from typed limits for backwards compat
- Frontend aggregateRuntimeUsage() tracks typed limits separately
- limit_kind field on predictions indicates which limit drove time-to-limit
This commit is contained in:
null 2026-05-21 01:43:28 -05:00
parent 02eb03d408
commit 5217a70c9f
3 changed files with 289 additions and 79 deletions

View File

@ -25,21 +25,44 @@ class RuntimeUsageCurrent(SQLModel):
"""Aggregated totals within the current window."""
total_cost_usd: float
total_tokens: int # input + output across all sessions
total_tokens: int # input + output across all sessions
total_output_tokens: int = 0 # output tokens only — used with output_token_limit
total_calls: int
token_limit: int | None = None # configured limit; None = unknown
token_pct: int | None = None # 0–100; None when limit unknown
# ── Legacy fields (kept for backwards compat) ────────────────────────────
# token_limit is ambiguous (could be total or output); use typed fields below
# when the limit kind is known.
token_limit: int | None = None
token_pct: int | None = None
cost_limit_usd: float | None = None
cost_pct: int | None = None
# Source and confidence for the limits
token_limit_source: str | None = None
cost_limit_source: str | None = None
# ── Typed limits (Phase 4) ────────────────────────────────────────────────
# Each field pairs a limit with a percent computed from matching units only.
# Output-token limit: compared against output tokens only, never input/cache.
output_token_limit: int | None = None
output_token_limit_pct: int | None = None
output_token_limit_source: str | None = None
# Total-token limit: compared against input + output combined.
total_token_limit: int | None = None
total_token_limit_pct: int | None = None
total_token_limit_source: str | None = None
# Message/request limit: compared against call count, never token totals.
message_limit: int | None = None
message_pct: int | None = None
message_limit_source: str | None = None
class RuntimeUsageBurnRate(SQLModel):
"""Recent token and cost velocity (last 60 minutes of the window)."""
tokens_per_minute: float
tokens_per_minute: float # input + output combined
output_tokens_per_minute: float = 0.0 # output tokens only
cost_usd_per_minute: float
@ -48,6 +71,7 @@ class RuntimeUsagePredictions(SQLModel):
time_to_limit_ms: int | None = None # None when limit or burn rate unknown
safe: bool # True if time_to_limit > reset_in_ms (will reset before hitting limit)
limit_kind: str = "total_tokens" # which limit drove this prediction
class ModelUsageEntry(SQLModel):

View File

@ -544,58 +544,107 @@ def _build_window(
)
def _limit_source(status_raw: dict[str, Any]) -> str:
"""Return the appropriate source label for a limit read from gateway status."""
has_rate_limit_headers = (
status_raw.get("x_ratelimit_remaining") or
status_raw.get("x_ratelimit_limit") or
status_raw.get("anthropic_ratelimit_remaining") or
status_raw.get("anthropic_ratelimit_limit")
)
return "provider_api_rate_limit" if has_rate_limit_headers else "configured_limit"
def _pct(numerator: int | float, denominator: int | float) -> int | None:
if not denominator:
return None
return int(min(100, numerator * 100 // denominator))
def _build_current(
per_model: dict[str, ModelUsageEntry],
status_raw: dict[str, Any],
account_key: str = "default",
) -> RuntimeUsageCurrent:
total_cost = round(sum(e.cost_usd for e in per_model.values()), 8)
total_tokens = sum(e.total_tokens for e in per_model.values())
total_calls = sum(e.calls for e in per_model.values())
total_cost = round(sum(e.cost_usd for e in per_model.values()), 8)
total_tokens = sum(e.total_tokens for e in per_model.values())
total_output_tokens = sum(e.output_tokens for e in per_model.values())
total_calls = sum(e.calls for e in per_model.values())
# Try to get configured limits from the gateway status
raw_token_limit = _get_int(status_raw, "tokenLimit", "token_limit", "messageLimit", "message_limit", default=0)
token_limit = raw_token_limit or None
src = _limit_source(status_raw)
# Determine source for token limit
if raw_token_limit:
# Check for API rate-limit headers
has_rate_limit_headers = (
status_raw.get("x_ratelimit_remaining") or
status_raw.get("x_ratelimit_limit") or
status_raw.get("anthropic_ratelimit_remaining") or
status_raw.get("anthropic_ratelimit_limit")
)
if has_rate_limit_headers:
token_limit_source = "provider_api_rate_limit"
else:
token_limit_source = "configured_limit"
else:
token_limit_source = None
# ── Explicit output-token limit ───────────────────────────────────────────
raw_output_limit = _get_int(
status_raw, "outputTokenLimit", "output_token_limit", default=0
)
output_token_limit = raw_output_limit or None
output_token_limit_pct = _pct(total_output_tokens, raw_output_limit)
output_token_limit_src = src if raw_output_limit else None
token_pct = int(min(100, total_tokens * 100 // raw_token_limit)) if raw_token_limit else None
# ── Explicit total-token limit ────────────────────────────────────────────
raw_total_limit = _get_int(
status_raw, "totalTokenLimit", "total_token_limit", default=0
)
total_token_limit = raw_total_limit or None
total_token_limit_pct = _pct(total_tokens, raw_total_limit)
total_token_limit_src = src if raw_total_limit else None
# ── Message/request limit (count-based, never token-based) ───────────────
raw_message_limit = _get_int(
status_raw, "messageLimit", "message_limit", "requestLimit", "request_limit",
default=0,
)
message_limit = raw_message_limit or None
message_pct = _pct(total_calls, raw_message_limit)
message_limit_src = src if raw_message_limit else None
# ── Legacy token_limit (ambiguous kind — maps to tokenLimit only) ─────────
# Do NOT fold messageLimit into this; keep units separate.
raw_token_limit = _get_int(status_raw, "tokenLimit", "token_limit", default=0)
token_limit = raw_token_limit or None
token_pct = _pct(total_tokens, raw_token_limit)
token_limit_src = src if raw_token_limit else None
# If we got an explicit typed limit but no legacy one, backfill legacy
# so existing dashboard code still works during the transition.
if token_limit is None:
if output_token_limit is not None:
token_limit = output_token_limit
token_pct = output_token_limit_pct
token_limit_src = output_token_limit_src
elif total_token_limit is not None:
token_limit = total_token_limit
token_pct = total_token_limit_pct
token_limit_src = total_token_limit_src
# ── Cost limit ────────────────────────────────────────────────────────────
raw_cost_limit = _get_float(status_raw, "costLimit", "cost_limit", "costLimitUsd", default=0.0)
cost_limit = raw_cost_limit or None
# Determine source for cost limit
if raw_cost_limit:
cost_limit_source = "configured_limit"
else:
cost_limit_source = None
cost_pct = int(min(100, total_cost * 100 / raw_cost_limit)) if raw_cost_limit else None
cost_limit = raw_cost_limit or None
cost_pct = _pct(total_cost, raw_cost_limit) if raw_cost_limit else None
cost_limit_src = src if raw_cost_limit else None
return RuntimeUsageCurrent(
total_cost_usd=total_cost,
total_tokens=total_tokens,
total_output_tokens=total_output_tokens,
total_calls=total_calls,
# legacy
token_limit=token_limit,
token_pct=token_pct,
cost_limit_usd=cost_limit,
cost_pct=cost_pct,
token_limit_source=token_limit_source,
cost_limit_source=cost_limit_source,
token_limit_source=token_limit_src,
cost_limit_source=cost_limit_src,
# typed
output_token_limit=output_token_limit,
output_token_limit_pct=output_token_limit_pct,
output_token_limit_source=output_token_limit_src,
total_token_limit=total_token_limit,
total_token_limit_pct=total_token_limit_pct,
total_token_limit_source=total_token_limit_src,
message_limit=message_limit,
message_pct=message_pct,
message_limit_source=message_limit_src,
)
@ -604,9 +653,14 @@ def _compute_burn_rate(
window: RuntimeUsageWindow,
now: datetime,
) -> RuntimeUsageBurnRate:
"""Compute tokens/min and cost/min from the most recent 60 minutes of sessions."""
"""Compute tokens/min and cost/min from the most recent 60 minutes of sessions.
Tracks total tokens (input+output) and output tokens separately so that
predictions against output-token limits use the correct numerator.
"""
cutoff = now - timedelta(minutes=60)
recent_tokens = 0
recent_output_tokens = 0
recent_cost = 0.0
for session in sessions:
@ -615,15 +669,14 @@ def _compute_burn_rate(
if ts is None or ts < cutoff:
continue
tokens = _parse_session_usage(session)
recent_tokens += tokens["input"] + tokens["output"]
recent_cost += _get_float(session, "cost", "cost_usd", "costUsd", default=0.0)
recent_tokens += tokens["input"] + tokens["output"]
recent_output_tokens += tokens["output"]
recent_cost += _get_float(session, "cost", "cost_usd", "costUsd", default=0.0)
# Rate per minute over the last 60 minutes
tokens_per_minute = round(recent_tokens / 60, 4)
cost_per_minute = round(recent_cost / 60, 8)
return RuntimeUsageBurnRate(
tokens_per_minute=tokens_per_minute,
cost_usd_per_minute=cost_per_minute,
tokens_per_minute=round(recent_tokens / 60, 4),
output_tokens_per_minute=round(recent_output_tokens / 60, 4),
cost_usd_per_minute=round(recent_cost / 60, 8),
)
@ -632,18 +685,79 @@ def _build_predictions(
burn_rate: RuntimeUsageBurnRate,
window: RuntimeUsageWindow,
) -> RuntimeUsagePredictions:
"""Estimate time-to-limit in ms based on total-token burn rate."""
if burn_rate.tokens_per_minute <= 0 or current.token_limit is None:
return RuntimeUsagePredictions(time_to_limit_ms=None, safe=True)
"""Estimate time-to-limit in ms using the most constrained matching limit.
tokens_remaining = current.token_limit - current.total_tokens
if tokens_remaining <= 0:
return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False)
Candidate limits (the smallest time-to-limit wins; an exhausted limit returns 0 immediately):
1. output_token_limit vs output_tokens (burn: output_tokens_per_minute)
2. total_token_limit vs total_tokens (burn: tokens_per_minute)
3. legacy token_limit vs total_tokens (burn: tokens_per_minute)
4. message_limit vs total_calls (constant rate = calls / window_minutes)
minutes_to_limit = tokens_remaining / burn_rate.tokens_per_minute
time_to_limit_ms = int(minutes_to_limit * 60 * 1000)
Cost limits are not used for time-to-limit since they require billing data.
Request limits are not tracked separately: they are read into message_limit.
"""
candidates: list[tuple[int, str]] = [] # (time_to_limit_ms, kind)
# ── Output-token limit ────────────────────────────────────────────────────
if (
current.output_token_limit is not None
and burn_rate.output_tokens_per_minute > 0
):
remaining = current.output_token_limit - current.total_output_tokens
if remaining <= 0:
return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False, limit_kind="output_tokens")
candidates.append((
int(remaining / burn_rate.output_tokens_per_minute * 60_000),
"output_tokens",
))
# ── Total-token limit ─────────────────────────────────────────────────────
if (
current.total_token_limit is not None
and burn_rate.tokens_per_minute > 0
):
remaining = current.total_token_limit - current.total_tokens
if remaining <= 0:
return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False, limit_kind="total_tokens")
candidates.append((
int(remaining / burn_rate.tokens_per_minute * 60_000),
"total_tokens",
))
# ── Legacy token_limit (only when no typed token limit) ───────────────────
if (
not candidates
and current.token_limit is not None
and burn_rate.tokens_per_minute > 0
):
remaining = current.token_limit - current.total_tokens
if remaining <= 0:
return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False, limit_kind="total_tokens")
candidates.append((
int(remaining / burn_rate.tokens_per_minute * 60_000),
"total_tokens",
))
# ── Message limit ─────────────────────────────────────────────────────────
if current.message_limit is not None and current.message_limit > 0:
window_minutes = max(window.reset_in_ms / 60_000, 1)
calls_per_minute = current.total_calls / window_minutes if window_minutes > 0 else 0
if calls_per_minute > 0:
remaining = current.message_limit - current.total_calls
if remaining <= 0:
return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False, limit_kind="messages")
candidates.append((
int(remaining / calls_per_minute * 60_000),
"messages",
))
if not candidates:
return RuntimeUsagePredictions(time_to_limit_ms=None, safe=True, limit_kind="total_tokens")
# Pick the most constrained (smallest time) — that is what will actually block work.
time_to_limit_ms, kind = min(candidates, key=lambda c: c[0])
safe = time_to_limit_ms > window.reset_in_ms
return RuntimeUsagePredictions(time_to_limit_ms=time_to_limit_ms, safe=safe)
return RuntimeUsagePredictions(time_to_limit_ms=time_to_limit_ms, safe=safe, limit_kind=kind)
# ---------------------------------------------------------------------------

View File

@ -16,15 +16,24 @@ import { DashboardEmptyState } from "./DashboardEmptyState";
export interface AggregatedRuntimeUsage {
totalCostUsd: number;
totalTokens: number;
totalOutputTokens: number;
totalCalls: number;
// Legacy (backwards compat)
tokenLimit: number | null;
tokenPct: number | null;
costLimitUsd: number | null;
// Typed limits (Phase 4)
outputTokenLimit: number | null;
outputTokenLimitPct: number | null;
messageLimit: number | null;
messagePct: number | null;
limitKind: string; // which limit drove the time-to-limit prediction
resetInMs: number;
resetsAt: string;
timeToLimitMs: number | null;
safe: boolean;
tokensPerMinute: number;
outputTokensPerMinute: number;
costUsdPerMinute: number;
perModel: Record<string, ModelUsageEntry>;
topSessions: TopSession[];
@ -54,15 +63,22 @@ export function aggregateRuntimeUsage(
return {
totalCostUsd: 0,
totalTokens: 0,
totalOutputTokens: 0,
totalCalls: 0,
tokenLimit: null,
tokenPct: null,
costLimitUsd: null,
outputTokenLimit: null,
outputTokenLimitPct: null,
messageLimit: null,
messagePct: null,
limitKind: "total_tokens",
resetInMs: 0,
resetsAt: "",
timeToLimitMs: null,
safe: true,
tokensPerMinute: 0,
outputTokensPerMinute: 0,
costUsdPerMinute: 0,
perModel: {},
topSessions: [],
@ -71,21 +87,24 @@ export function aggregateRuntimeUsage(
let totalCostUsd = 0;
let totalTokens = 0;
let totalOutputTokens = 0;
let totalCalls = 0;
let tokensPerMinute = 0;
let outputTokensPerMinute = 0;
let costUsdPerMinute = 0;
// Use the window that resets soonest as the binding constraint
let resetInMs = valid[0].window.reset_in_ms;
let resetsAt = valid[0].window.resets_at;
const perModel: Record<string, ModelUsageEntry> = {};
const allSessions: TopSession[] = [];
for (const r of valid) {
totalCostUsd += r.current.total_cost_usd;
totalTokens += r.current.total_tokens;
totalCalls += r.current.total_calls;
tokensPerMinute += r.burn_rate.tokens_per_minute;
costUsdPerMinute += r.burn_rate.cost_usd_per_minute;
totalCostUsd += r.current.total_cost_usd;
totalTokens += r.current.total_tokens;
totalOutputTokens += r.current.total_output_tokens ?? 0;
totalCalls += r.current.total_calls;
tokensPerMinute += r.burn_rate.tokens_per_minute;
outputTokensPerMinute += r.burn_rate.output_tokens_per_minute ?? 0;
costUsdPerMinute += r.burn_rate.cost_usd_per_minute;
if (r.window.reset_in_ms < resetInMs) {
resetInMs = r.window.reset_in_ms;
@ -113,25 +132,45 @@ export function aggregateRuntimeUsage(
allSessions.push(...r.top_sessions);
}
// Aggregate token limits — only meaningful if all gateways share one limit
const limits = valid.map(r => r.current.token_limit).filter((v): v is number => v !== null);
// Legacy token limit
const limits = valid.map(r => r.current.token_limit).filter((v): v is number => v !== null);
const tokenLimit = limits.length > 0 ? limits.reduce((a, b) => a + b, 0) : null;
const tokenPct = tokenLimit ? Math.min(100, Math.round(totalTokens * 100 / tokenLimit)) : null;
const costLimits = valid.map(r => r.current.cost_limit_usd).filter((v): v is number => v !== null);
const costLimitUsd = costLimits.length > 0 ? costLimits.reduce((a, b) => a + b, 0) : null;
// Re-derive time-to-limit from aggregated burn rate
// Typed limits
const outLimits = valid.map(r => r.current.output_token_limit).filter((v): v is number => v !== null);
const outputTokenLimit = outLimits.length > 0 ? outLimits.reduce((a, b) => a + b, 0) : null;
const outputTokenLimitPct = outputTokenLimit
? Math.min(100, Math.round(totalOutputTokens * 100 / outputTokenLimit))
: null;
const msgLimits = valid.map(r => r.current.message_limit).filter((v): v is number => v !== null);
const messageLimit = msgLimits.length > 0 ? msgLimits.reduce((a, b) => a + b, 0) : null;
const messagePct = messageLimit ? Math.min(100, Math.round(totalCalls * 100 / messageLimit)) : null;
// Re-derive time-to-limit using the most constrained matching limit
let timeToLimitMs: number | null = null;
let safe = true;
let limitKind = "total_tokens";
const candidates: Array<{ ms: number; kind: string }> = [];
if (outputTokenLimit !== null && outputTokensPerMinute > 0) {
const rem = outputTokenLimit - totalOutputTokens;
if (rem <= 0) return { totalCostUsd: Math.round(totalCostUsd * 1e8) / 1e8, totalTokens, totalOutputTokens, totalCalls, tokenLimit, tokenPct, costLimitUsd, outputTokenLimit, outputTokenLimitPct, messageLimit, messagePct, limitKind: "output_tokens", resetInMs, resetsAt, timeToLimitMs: 0, safe: false, tokensPerMinute: Math.round(tokensPerMinute * 100) / 100, outputTokensPerMinute: Math.round(outputTokensPerMinute * 100) / 100, costUsdPerMinute: Math.round(costUsdPerMinute * 1e8) / 1e8, perModel, topSessions: [...allSessions].sort((a, b) => b.cost_usd - a.cost_usd).slice(0, 10) };
candidates.push({ ms: Math.round(rem / outputTokensPerMinute * 60_000), kind: "output_tokens" });
}
if (tokenLimit !== null && tokensPerMinute > 0) {
const remaining = tokenLimit - totalTokens;
if (remaining <= 0) {
timeToLimitMs = 0;
safe = false;
} else {
timeToLimitMs = Math.round((remaining / tokensPerMinute) * 60 * 1000);
safe = timeToLimitMs > resetInMs;
}
const rem = tokenLimit - totalTokens;
if (rem <= 0) return { totalCostUsd: Math.round(totalCostUsd * 1e8) / 1e8, totalTokens, totalOutputTokens, totalCalls, tokenLimit, tokenPct, costLimitUsd, outputTokenLimit, outputTokenLimitPct, messageLimit, messagePct, limitKind: "total_tokens", resetInMs, resetsAt, timeToLimitMs: 0, safe: false, tokensPerMinute: Math.round(tokensPerMinute * 100) / 100, outputTokensPerMinute: Math.round(outputTokensPerMinute * 100) / 100, costUsdPerMinute: Math.round(costUsdPerMinute * 1e8) / 1e8, perModel, topSessions: [...allSessions].sort((a, b) => b.cost_usd - a.cost_usd).slice(0, 10) };
candidates.push({ ms: Math.round(rem / tokensPerMinute * 60_000), kind: "total_tokens" });
}
if (candidates.length > 0) {
const tightest = candidates.reduce((a, b) => a.ms < b.ms ? a : b);
timeToLimitMs = tightest.ms;
limitKind = tightest.kind;
safe = timeToLimitMs > resetInMs;
}
const topSessions = [...allSessions]
@ -141,15 +180,22 @@ export function aggregateRuntimeUsage(
return {
totalCostUsd: Math.round(totalCostUsd * 1e8) / 1e8,
totalTokens,
totalOutputTokens,
totalCalls,
tokenLimit,
tokenPct,
costLimitUsd,
outputTokenLimit,
outputTokenLimitPct,
messageLimit,
messagePct,
limitKind,
resetInMs,
resetsAt,
timeToLimitMs,
safe,
tokensPerMinute: Math.round(tokensPerMinute * 100) / 100,
outputTokensPerMinute: Math.round(outputTokensPerMinute * 100) / 100,
costUsdPerMinute: Math.round(costUsdPerMinute * 1e8) / 1e8,
perModel,
topSessions,
@ -343,14 +389,33 @@ export function RuntimeUsageSection({
<StatCard
label="Time to Limit"
value={usage.timeToLimitMs === null ? "—" : usage.timeToLimitMs === 0 ? "At limit" : fmtMs(usage.timeToLimitMs)}
sub={usage.tokenLimit ? `${usage.tokenPct ?? 0}% of ${fmtTokens(usage.tokenLimit)}` : undefined}
sub={(() => {
if (usage.outputTokenLimit) {
return `${usage.outputTokenLimitPct ?? 0}% of ${fmtTokens(usage.outputTokenLimit)} out`;
}
if (usage.tokenLimit) {
return `${usage.tokenPct ?? 0}% of ${fmtTokens(usage.tokenLimit)}`;
}
if (usage.messageLimit) {
return `${usage.messagePct ?? 0}% of ${usage.messageLimit} msgs`;
}
return undefined;
})()}
tone={safenessTone}
icon={<TrendingDown className="h-3 w-3" />}
/>
<StatCard
label="Burn Rate"
value={usage.costUsdPerMinute > 0 ? `${fmtCost(usage.costUsdPerMinute)}/m` : "—"}
sub={usage.tokensPerMinute > 0 ? `${fmtTokens(usage.tokensPerMinute)} tok/m` : undefined}
sub={(() => {
if (usage.outputTokensPerMinute > 0 && usage.outputTokenLimit) {
return `${fmtTokens(usage.outputTokensPerMinute)} out-tok/m`;
}
if (usage.tokensPerMinute > 0) {
return `${fmtTokens(usage.tokensPerMinute)} tok/m`;
}
return undefined;
})()}
icon={<Flame className="h-3 w-3" />}
/>
</div>
@ -374,8 +439,10 @@ export function RuntimeUsageSection({
<th className="px-3 py-2 text-right font-semibold text-muted">Tokens out</th>
<th className="px-3 py-2 text-right font-semibold text-muted">Cost</th>
<th className="px-3 py-2 text-right font-semibold text-muted">Calls</th>
{usage.tokenLimit !== null && (
<th className="px-3 py-2 text-right font-semibold text-muted">% limit</th>
{(usage.outputTokenLimit !== null || usage.tokenLimit !== null) && (
<th className="px-3 py-2 text-right font-semibold text-muted">
{usage.outputTokenLimit !== null ? "% out limit" : "% limit"}
</th>
)}
</tr>
</thead>
@ -396,7 +463,12 @@ export function RuntimeUsageSection({
{entry.unpriced ? <span className="text-[color:var(--warning)]">*</span> : fmtCost(entry.cost_usd)}
</td>
<td className="px-3 py-2 text-right tabular-nums text-muted">{entry.calls}</td>
{usage.tokenLimit !== null && (
{usage.outputTokenLimit !== null && (
<td className="px-3 py-2 text-right tabular-nums text-muted">
{Math.round(entry.output_tokens * 100 / usage.outputTokenLimit)}%
</td>
)}
{usage.outputTokenLimit === null && usage.tokenLimit !== null && (
<td className="px-3 py-2 text-right tabular-nums text-muted">
{Math.round(entry.total_tokens * 100 / usage.tokenLimit)}%
</td>