feat(usage): separate limit types — typed limits for output tokens, total tokens, messages (#39)
- Add typed limit fields to RuntimeUsageCurrent: output_token_limit, total_token_limit, and message_limit, each with a matching pct and source field
- Add total_output_tokens to the current totals and output_tokens_per_minute to the burn rate
- _build_current() now computes each pct from matching units only
- Legacy token_limit is backfilled from the typed limits for backwards compatibility
- Frontend aggregateRuntimeUsage() tracks typed limits separately
- limit_kind field on predictions indicates which limit drove time-to-limit
This commit is contained in:
parent
02eb03d408
commit
5217a70c9f
|
|
@ -25,21 +25,44 @@ class RuntimeUsageCurrent(SQLModel):
|
|||
"""Aggregated totals within the current window."""
|
||||
|
||||
total_cost_usd: float
|
||||
total_tokens: int # input + output across all sessions
|
||||
total_tokens: int # input + output across all sessions
|
||||
total_output_tokens: int = 0 # output tokens only — used with output_token_limit
|
||||
total_calls: int
|
||||
token_limit: int | None = None # configured limit; None = unknown
|
||||
token_pct: int | None = None # 0–100; None when limit unknown
|
||||
|
||||
# ── Legacy fields (kept for backwards compat) ────────────────────────────
|
||||
# token_limit is ambiguous (could be total or output); use typed fields below
|
||||
# when the limit kind is known.
|
||||
token_limit: int | None = None
|
||||
token_pct: int | None = None
|
||||
cost_limit_usd: float | None = None
|
||||
cost_pct: int | None = None
|
||||
# Source and confidence for the limits
|
||||
token_limit_source: str | None = None
|
||||
cost_limit_source: str | None = None
|
||||
|
||||
# ── Typed limits (Phase 4) ────────────────────────────────────────────────
|
||||
# Each field pairs a limit with a percent computed from matching units only.
|
||||
|
||||
# Output-token limit: compared against output tokens only, never input/cache.
|
||||
output_token_limit: int | None = None
|
||||
output_token_limit_pct: int | None = None
|
||||
output_token_limit_source: str | None = None
|
||||
|
||||
# Total-token limit: compared against input + output combined.
|
||||
total_token_limit: int | None = None
|
||||
total_token_limit_pct: int | None = None
|
||||
total_token_limit_source: str | None = None
|
||||
|
||||
# Message/request limit: compared against call count, never token totals.
|
||||
message_limit: int | None = None
|
||||
message_pct: int | None = None
|
||||
message_limit_source: str | None = None
|
||||
|
||||
|
||||
class RuntimeUsageBurnRate(SQLModel):
    """Recent token and cost velocity (last 60 minutes of the window)."""

    # Combined input + output tokens consumed per minute.
    tokens_per_minute: float
    # Output tokens only; defaults to 0.0 so payloads that omit it still validate.
    output_tokens_per_minute: float = 0.0
    # Spend in USD per minute.
    cost_usd_per_minute: float
|
||||
|
||||
|
||||
|
|
@ -48,6 +71,7 @@ class RuntimeUsagePredictions(SQLModel):
|
|||
|
||||
time_to_limit_ms: int | None = None # None when limit or burn rate unknown
|
||||
safe: bool # True if time_to_limit > reset_in_ms (will reset before hitting limit)
|
||||
limit_kind: str = "total_tokens" # which limit drove this prediction
|
||||
|
||||
|
||||
class ModelUsageEntry(SQLModel):
|
||||
|
|
|
|||
|
|
@ -544,58 +544,107 @@ def _build_window(
|
|||
)
|
||||
|
||||
|
||||
def _limit_source(status_raw: dict[str, Any]) -> str:
|
||||
"""Return the appropriate source label for a limit read from gateway status."""
|
||||
has_rate_limit_headers = (
|
||||
status_raw.get("x_ratelimit_remaining") or
|
||||
status_raw.get("x_ratelimit_limit") or
|
||||
status_raw.get("anthropic_ratelimit_remaining") or
|
||||
status_raw.get("anthropic_ratelimit_limit")
|
||||
)
|
||||
return "provider_api_rate_limit" if has_rate_limit_headers else "configured_limit"
|
||||
|
||||
|
||||
def _pct(numerator: int | float, denominator: int | float) -> int | None:
|
||||
if not denominator:
|
||||
return None
|
||||
return int(min(100, numerator * 100 // denominator))
|
||||
|
||||
|
||||
def _build_current(
    per_model: dict[str, ModelUsageEntry],
    status_raw: dict[str, Any],
    account_key: str = "default",
) -> RuntimeUsageCurrent:
    """Aggregate per-model usage and pair each configured limit with a percent.

    Each percent is computed from matching units only: output tokens against
    the output-token limit, total tokens against the total-token limit, and
    call count against the message limit.

    Args:
        per_model: Usage entries keyed by model; their cost, token and call
            counts are summed.
        status_raw: Raw gateway status mapping the limits are read from.
        account_key: Accepted for interface compatibility; not used here.

    Returns:
        A ``RuntimeUsageCurrent`` carrying the totals, the legacy
        ``token_limit`` fields, and the typed limit fields.
    """
    entries = list(per_model.values())
    total_cost = round(sum(e.cost_usd for e in entries), 8)
    total_tokens = sum(e.total_tokens for e in entries)
    total_output_tokens = sum(e.output_tokens for e in entries)
    total_calls = sum(e.calls for e in entries)

    # One source label is shared by every limit found in this status payload.
    src = _limit_source(status_raw)

    # ── Explicit output-token limit ───────────────────────────────────────────
    raw_output_limit = _get_int(
        status_raw, "outputTokenLimit", "output_token_limit", default=0
    )
    output_token_limit = raw_output_limit or None
    output_token_limit_pct = _pct(total_output_tokens, raw_output_limit)
    output_token_limit_src = src if raw_output_limit else None

    # ── Explicit total-token limit ────────────────────────────────────────────
    raw_total_limit = _get_int(
        status_raw, "totalTokenLimit", "total_token_limit", default=0
    )
    total_token_limit = raw_total_limit or None
    total_token_limit_pct = _pct(total_tokens, raw_total_limit)
    total_token_limit_src = src if raw_total_limit else None

    # ── Message/request limit (count-based, never token-based) ───────────────
    raw_message_limit = _get_int(
        status_raw, "messageLimit", "message_limit", "requestLimit", "request_limit",
        default=0,
    )
    message_limit = raw_message_limit or None
    message_pct = _pct(total_calls, raw_message_limit)
    message_limit_src = src if raw_message_limit else None

    # ── Legacy token_limit (ambiguous kind — maps to tokenLimit only) ─────────
    # Do NOT fold messageLimit into this; keep units separate.
    raw_token_limit = _get_int(status_raw, "tokenLimit", "token_limit", default=0)
    token_limit = raw_token_limit or None
    token_pct = _pct(total_tokens, raw_token_limit)
    token_limit_src = src if raw_token_limit else None

    # If we got an explicit typed limit but no legacy one, backfill legacy
    # so existing dashboard code still works during the transition.
    if token_limit is None:
        if output_token_limit is not None:
            token_limit = output_token_limit
            token_pct = output_token_limit_pct
            token_limit_src = output_token_limit_src
        elif total_token_limit is not None:
            token_limit = total_token_limit
            token_pct = total_token_limit_pct
            token_limit_src = total_token_limit_src

    # ── Cost limit ────────────────────────────────────────────────────────────
    raw_cost_limit = _get_float(
        status_raw, "costLimit", "cost_limit", "costLimitUsd", default=0.0
    )
    cost_limit = raw_cost_limit or None
    # _pct already yields None for a missing/zero limit, so no extra guard.
    cost_pct = _pct(total_cost, raw_cost_limit)
    # NOTE(review): this reuses the shared label, so a cost limit can be tagged
    # "provider_api_rate_limit" when rate-limit headers are present — confirm
    # that is intended.
    cost_limit_src = src if raw_cost_limit else None

    return RuntimeUsageCurrent(
        total_cost_usd=total_cost,
        total_tokens=total_tokens,
        total_output_tokens=total_output_tokens,
        total_calls=total_calls,
        # legacy
        token_limit=token_limit,
        token_pct=token_pct,
        cost_limit_usd=cost_limit,
        cost_pct=cost_pct,
        token_limit_source=token_limit_src,
        cost_limit_source=cost_limit_src,
        # typed
        output_token_limit=output_token_limit,
        output_token_limit_pct=output_token_limit_pct,
        output_token_limit_source=output_token_limit_src,
        total_token_limit=total_token_limit,
        total_token_limit_pct=total_token_limit_pct,
        total_token_limit_source=total_token_limit_src,
        message_limit=message_limit,
        message_pct=message_pct,
        message_limit_source=message_limit_src,
    )
|
||||
|
||||
|
||||
|
|
@ -604,9 +653,14 @@ def _compute_burn_rate(
|
|||
window: RuntimeUsageWindow,
|
||||
now: datetime,
|
||||
) -> RuntimeUsageBurnRate:
|
||||
"""Compute tokens/min and cost/min from the most recent 60 minutes of sessions."""
|
||||
"""Compute tokens/min and cost/min from the most recent 60 minutes of sessions.
|
||||
|
||||
Tracks total tokens (input+output) and output tokens separately so that
|
||||
predictions against output-token limits use the correct numerator.
|
||||
"""
|
||||
cutoff = now - timedelta(minutes=60)
|
||||
recent_tokens = 0
|
||||
recent_output_tokens = 0
|
||||
recent_cost = 0.0
|
||||
|
||||
for session in sessions:
|
||||
|
|
@ -615,15 +669,14 @@ def _compute_burn_rate(
|
|||
if ts is None or ts < cutoff:
|
||||
continue
|
||||
tokens = _parse_session_usage(session)
|
||||
recent_tokens += tokens["input"] + tokens["output"]
|
||||
recent_cost += _get_float(session, "cost", "cost_usd", "costUsd", default=0.0)
|
||||
recent_tokens += tokens["input"] + tokens["output"]
|
||||
recent_output_tokens += tokens["output"]
|
||||
recent_cost += _get_float(session, "cost", "cost_usd", "costUsd", default=0.0)
|
||||
|
||||
# Rate per minute over the last 60 minutes
|
||||
tokens_per_minute = round(recent_tokens / 60, 4)
|
||||
cost_per_minute = round(recent_cost / 60, 8)
|
||||
return RuntimeUsageBurnRate(
|
||||
tokens_per_minute=tokens_per_minute,
|
||||
cost_usd_per_minute=cost_per_minute,
|
||||
tokens_per_minute=round(recent_tokens / 60, 4),
|
||||
output_tokens_per_minute=round(recent_output_tokens / 60, 4),
|
||||
cost_usd_per_minute=round(recent_cost / 60, 8),
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -632,18 +685,79 @@ def _build_predictions(
|
|||
burn_rate: RuntimeUsageBurnRate,
|
||||
window: RuntimeUsageWindow,
|
||||
) -> RuntimeUsagePredictions:
|
||||
"""Estimate time-to-limit in ms based on total-token burn rate."""
|
||||
if burn_rate.tokens_per_minute <= 0 or current.token_limit is None:
|
||||
return RuntimeUsagePredictions(time_to_limit_ms=None, safe=True)
|
||||
"""Estimate time-to-limit in ms using the most constrained matching limit.
|
||||
|
||||
tokens_remaining = current.token_limit - current.total_tokens
|
||||
if tokens_remaining <= 0:
|
||||
return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False)
|
||||
Priority order (tightest first):
|
||||
1. output_token_limit vs output_tokens (burn: output_tokens_per_minute)
|
||||
2. total_token_limit vs total_tokens (burn: tokens_per_minute)
|
||||
3. legacy token_limit vs total_tokens (burn: tokens_per_minute)
|
||||
4. message_limit vs total_calls (constant rate = calls / window_minutes)
|
||||
|
||||
minutes_to_limit = tokens_remaining / burn_rate.tokens_per_minute
|
||||
time_to_limit_ms = int(minutes_to_limit * 60 * 1000)
|
||||
Cost and request limits are not used for time-to-limit since they either
|
||||
require billing data (cost) or are not the binding constraint in practice.
|
||||
"""
|
||||
candidates: list[tuple[int, str]] = [] # (time_to_limit_ms, kind)
|
||||
|
||||
# ── Output-token limit ────────────────────────────────────────────────────
|
||||
if (
|
||||
current.output_token_limit is not None
|
||||
and burn_rate.output_tokens_per_minute > 0
|
||||
):
|
||||
remaining = current.output_token_limit - current.total_output_tokens
|
||||
if remaining <= 0:
|
||||
return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False, limit_kind="output_tokens")
|
||||
candidates.append((
|
||||
int(remaining / burn_rate.output_tokens_per_minute * 60_000),
|
||||
"output_tokens",
|
||||
))
|
||||
|
||||
# ── Total-token limit ─────────────────────────────────────────────────────
|
||||
if (
|
||||
current.total_token_limit is not None
|
||||
and burn_rate.tokens_per_minute > 0
|
||||
):
|
||||
remaining = current.total_token_limit - current.total_tokens
|
||||
if remaining <= 0:
|
||||
return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False, limit_kind="total_tokens")
|
||||
candidates.append((
|
||||
int(remaining / burn_rate.tokens_per_minute * 60_000),
|
||||
"total_tokens",
|
||||
))
|
||||
|
||||
# ── Legacy token_limit (only when no typed token limit) ───────────────────
|
||||
if (
|
||||
not candidates
|
||||
and current.token_limit is not None
|
||||
and burn_rate.tokens_per_minute > 0
|
||||
):
|
||||
remaining = current.token_limit - current.total_tokens
|
||||
if remaining <= 0:
|
||||
return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False, limit_kind="total_tokens")
|
||||
candidates.append((
|
||||
int(remaining / burn_rate.tokens_per_minute * 60_000),
|
||||
"total_tokens",
|
||||
))
|
||||
|
||||
# ── Message limit ─────────────────────────────────────────────────────────
|
||||
if current.message_limit is not None and current.message_limit > 0:
|
||||
window_minutes = max(window.reset_in_ms / 60_000, 1)
|
||||
calls_per_minute = current.total_calls / window_minutes if window_minutes > 0 else 0
|
||||
if calls_per_minute > 0:
|
||||
remaining = current.message_limit - current.total_calls
|
||||
if remaining <= 0:
|
||||
return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False, limit_kind="messages")
|
||||
candidates.append((
|
||||
int(remaining / calls_per_minute * 60_000),
|
||||
"messages",
|
||||
))
|
||||
|
||||
if not candidates:
|
||||
return RuntimeUsagePredictions(time_to_limit_ms=None, safe=True, limit_kind="total_tokens")
|
||||
|
||||
# Pick the most constrained (smallest time) — that is what will actually block work.
|
||||
time_to_limit_ms, kind = min(candidates, key=lambda c: c[0])
|
||||
safe = time_to_limit_ms > window.reset_in_ms
|
||||
return RuntimeUsagePredictions(time_to_limit_ms=time_to_limit_ms, safe=safe)
|
||||
return RuntimeUsagePredictions(time_to_limit_ms=time_to_limit_ms, safe=safe, limit_kind=kind)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
|
|||
|
|
@ -16,15 +16,24 @@ import { DashboardEmptyState } from "./DashboardEmptyState";
|
|||
export interface AggregatedRuntimeUsage {
|
||||
totalCostUsd: number;
|
||||
totalTokens: number;
|
||||
totalOutputTokens: number;
|
||||
totalCalls: number;
|
||||
// Legacy (backwards compat)
|
||||
tokenLimit: number | null;
|
||||
tokenPct: number | null;
|
||||
costLimitUsd: number | null;
|
||||
// Typed limits (Phase 4)
|
||||
outputTokenLimit: number | null;
|
||||
outputTokenLimitPct: number | null;
|
||||
messageLimit: number | null;
|
||||
messagePct: number | null;
|
||||
limitKind: string; // which limit drove the time-to-limit prediction
|
||||
resetInMs: number;
|
||||
resetsAt: string;
|
||||
timeToLimitMs: number | null;
|
||||
safe: boolean;
|
||||
tokensPerMinute: number;
|
||||
outputTokensPerMinute: number;
|
||||
costUsdPerMinute: number;
|
||||
perModel: Record<string, ModelUsageEntry>;
|
||||
topSessions: TopSession[];
|
||||
|
|
@ -54,15 +63,22 @@ export function aggregateRuntimeUsage(
|
|||
return {
|
||||
totalCostUsd: 0,
|
||||
totalTokens: 0,
|
||||
totalOutputTokens: 0,
|
||||
totalCalls: 0,
|
||||
tokenLimit: null,
|
||||
tokenPct: null,
|
||||
costLimitUsd: null,
|
||||
outputTokenLimit: null,
|
||||
outputTokenLimitPct: null,
|
||||
messageLimit: null,
|
||||
messagePct: null,
|
||||
limitKind: "total_tokens",
|
||||
resetInMs: 0,
|
||||
resetsAt: "",
|
||||
timeToLimitMs: null,
|
||||
safe: true,
|
||||
tokensPerMinute: 0,
|
||||
outputTokensPerMinute: 0,
|
||||
costUsdPerMinute: 0,
|
||||
perModel: {},
|
||||
topSessions: [],
|
||||
|
|
@ -71,21 +87,24 @@ export function aggregateRuntimeUsage(
|
|||
|
||||
let totalCostUsd = 0;
|
||||
let totalTokens = 0;
|
||||
let totalOutputTokens = 0;
|
||||
let totalCalls = 0;
|
||||
let tokensPerMinute = 0;
|
||||
let outputTokensPerMinute = 0;
|
||||
let costUsdPerMinute = 0;
|
||||
// Use the window that resets soonest as the binding constraint
|
||||
let resetInMs = valid[0].window.reset_in_ms;
|
||||
let resetsAt = valid[0].window.resets_at;
|
||||
const perModel: Record<string, ModelUsageEntry> = {};
|
||||
const allSessions: TopSession[] = [];
|
||||
|
||||
for (const r of valid) {
|
||||
totalCostUsd += r.current.total_cost_usd;
|
||||
totalTokens += r.current.total_tokens;
|
||||
totalCalls += r.current.total_calls;
|
||||
tokensPerMinute += r.burn_rate.tokens_per_minute;
|
||||
costUsdPerMinute += r.burn_rate.cost_usd_per_minute;
|
||||
totalCostUsd += r.current.total_cost_usd;
|
||||
totalTokens += r.current.total_tokens;
|
||||
totalOutputTokens += r.current.total_output_tokens ?? 0;
|
||||
totalCalls += r.current.total_calls;
|
||||
tokensPerMinute += r.burn_rate.tokens_per_minute;
|
||||
outputTokensPerMinute += r.burn_rate.output_tokens_per_minute ?? 0;
|
||||
costUsdPerMinute += r.burn_rate.cost_usd_per_minute;
|
||||
|
||||
if (r.window.reset_in_ms < resetInMs) {
|
||||
resetInMs = r.window.reset_in_ms;
|
||||
|
|
@ -113,25 +132,45 @@ export function aggregateRuntimeUsage(
|
|||
allSessions.push(...r.top_sessions);
|
||||
}
|
||||
|
||||
// Aggregate token limits — only meaningful if all gateways share one limit
|
||||
const limits = valid.map(r => r.current.token_limit).filter((v): v is number => v !== null);
|
||||
// Legacy token limit
|
||||
const limits = valid.map(r => r.current.token_limit).filter((v): v is number => v !== null);
|
||||
const tokenLimit = limits.length > 0 ? limits.reduce((a, b) => a + b, 0) : null;
|
||||
const tokenPct = tokenLimit ? Math.min(100, Math.round(totalTokens * 100 / tokenLimit)) : null;
|
||||
const costLimits = valid.map(r => r.current.cost_limit_usd).filter((v): v is number => v !== null);
|
||||
const costLimitUsd = costLimits.length > 0 ? costLimits.reduce((a, b) => a + b, 0) : null;
|
||||
|
||||
// Re-derive time-to-limit from aggregated burn rate
|
||||
// Typed limits
|
||||
const outLimits = valid.map(r => r.current.output_token_limit).filter((v): v is number => v !== null);
|
||||
const outputTokenLimit = outLimits.length > 0 ? outLimits.reduce((a, b) => a + b, 0) : null;
|
||||
const outputTokenLimitPct = outputTokenLimit
|
||||
? Math.min(100, Math.round(totalOutputTokens * 100 / outputTokenLimit))
|
||||
: null;
|
||||
|
||||
const msgLimits = valid.map(r => r.current.message_limit).filter((v): v is number => v !== null);
|
||||
const messageLimit = msgLimits.length > 0 ? msgLimits.reduce((a, b) => a + b, 0) : null;
|
||||
const messagePct = messageLimit ? Math.min(100, Math.round(totalCalls * 100 / messageLimit)) : null;
|
||||
|
||||
// Re-derive time-to-limit using the most constrained matching limit
|
||||
let timeToLimitMs: number | null = null;
|
||||
let safe = true;
|
||||
let limitKind = "total_tokens";
|
||||
const candidates: Array<{ ms: number; kind: string }> = [];
|
||||
|
||||
if (outputTokenLimit !== null && outputTokensPerMinute > 0) {
|
||||
const rem = outputTokenLimit - totalOutputTokens;
|
||||
if (rem <= 0) return { totalCostUsd: Math.round(totalCostUsd * 1e8) / 1e8, totalTokens, totalOutputTokens, totalCalls, tokenLimit, tokenPct, costLimitUsd, outputTokenLimit, outputTokenLimitPct, messageLimit, messagePct, limitKind: "output_tokens", resetInMs, resetsAt, timeToLimitMs: 0, safe: false, tokensPerMinute: Math.round(tokensPerMinute * 100) / 100, outputTokensPerMinute: Math.round(outputTokensPerMinute * 100) / 100, costUsdPerMinute: Math.round(costUsdPerMinute * 1e8) / 1e8, perModel, topSessions: [...allSessions].sort((a, b) => b.cost_usd - a.cost_usd).slice(0, 10) };
|
||||
candidates.push({ ms: Math.round(rem / outputTokensPerMinute * 60_000), kind: "output_tokens" });
|
||||
}
|
||||
if (tokenLimit !== null && tokensPerMinute > 0) {
|
||||
const remaining = tokenLimit - totalTokens;
|
||||
if (remaining <= 0) {
|
||||
timeToLimitMs = 0;
|
||||
safe = false;
|
||||
} else {
|
||||
timeToLimitMs = Math.round((remaining / tokensPerMinute) * 60 * 1000);
|
||||
safe = timeToLimitMs > resetInMs;
|
||||
}
|
||||
const rem = tokenLimit - totalTokens;
|
||||
if (rem <= 0) return { totalCostUsd: Math.round(totalCostUsd * 1e8) / 1e8, totalTokens, totalOutputTokens, totalCalls, tokenLimit, tokenPct, costLimitUsd, outputTokenLimit, outputTokenLimitPct, messageLimit, messagePct, limitKind: "total_tokens", resetInMs, resetsAt, timeToLimitMs: 0, safe: false, tokensPerMinute: Math.round(tokensPerMinute * 100) / 100, outputTokensPerMinute: Math.round(outputTokensPerMinute * 100) / 100, costUsdPerMinute: Math.round(costUsdPerMinute * 1e8) / 1e8, perModel, topSessions: [...allSessions].sort((a, b) => b.cost_usd - a.cost_usd).slice(0, 10) };
|
||||
candidates.push({ ms: Math.round(rem / tokensPerMinute * 60_000), kind: "total_tokens" });
|
||||
}
|
||||
if (candidates.length > 0) {
|
||||
const tightest = candidates.reduce((a, b) => a.ms < b.ms ? a : b);
|
||||
timeToLimitMs = tightest.ms;
|
||||
limitKind = tightest.kind;
|
||||
safe = timeToLimitMs > resetInMs;
|
||||
}
|
||||
|
||||
const topSessions = [...allSessions]
|
||||
|
|
@ -141,15 +180,22 @@ export function aggregateRuntimeUsage(
|
|||
return {
|
||||
totalCostUsd: Math.round(totalCostUsd * 1e8) / 1e8,
|
||||
totalTokens,
|
||||
totalOutputTokens,
|
||||
totalCalls,
|
||||
tokenLimit,
|
||||
tokenPct,
|
||||
costLimitUsd,
|
||||
outputTokenLimit,
|
||||
outputTokenLimitPct,
|
||||
messageLimit,
|
||||
messagePct,
|
||||
limitKind,
|
||||
resetInMs,
|
||||
resetsAt,
|
||||
timeToLimitMs,
|
||||
safe,
|
||||
tokensPerMinute: Math.round(tokensPerMinute * 100) / 100,
|
||||
outputTokensPerMinute: Math.round(outputTokensPerMinute * 100) / 100,
|
||||
costUsdPerMinute: Math.round(costUsdPerMinute * 1e8) / 1e8,
|
||||
perModel,
|
||||
topSessions,
|
||||
|
|
@ -343,14 +389,33 @@ export function RuntimeUsageSection({
|
|||
<StatCard
|
||||
label="Time to Limit"
|
||||
value={usage.timeToLimitMs === null ? "—" : usage.timeToLimitMs === 0 ? "At limit" : fmtMs(usage.timeToLimitMs)}
|
||||
sub={usage.tokenLimit ? `${usage.tokenPct ?? 0}% of ${fmtTokens(usage.tokenLimit)}` : undefined}
|
||||
sub={(() => {
|
||||
if (usage.outputTokenLimit) {
|
||||
return `${usage.outputTokenLimitPct ?? 0}% of ${fmtTokens(usage.outputTokenLimit)} out`;
|
||||
}
|
||||
if (usage.tokenLimit) {
|
||||
return `${usage.tokenPct ?? 0}% of ${fmtTokens(usage.tokenLimit)}`;
|
||||
}
|
||||
if (usage.messageLimit) {
|
||||
return `${usage.messagePct ?? 0}% of ${usage.messageLimit} msgs`;
|
||||
}
|
||||
return undefined;
|
||||
})()}
|
||||
tone={safenessTone}
|
||||
icon={<TrendingDown className="h-3 w-3" />}
|
||||
/>
|
||||
<StatCard
|
||||
label="Burn Rate"
|
||||
value={usage.costUsdPerMinute > 0 ? `${fmtCost(usage.costUsdPerMinute)}/m` : "—"}
|
||||
sub={usage.tokensPerMinute > 0 ? `${fmtTokens(usage.tokensPerMinute)} tok/m` : undefined}
|
||||
sub={(() => {
|
||||
if (usage.outputTokensPerMinute > 0 && usage.outputTokenLimit) {
|
||||
return `${fmtTokens(usage.outputTokensPerMinute)} out-tok/m`;
|
||||
}
|
||||
if (usage.tokensPerMinute > 0) {
|
||||
return `${fmtTokens(usage.tokensPerMinute)} tok/m`;
|
||||
}
|
||||
return undefined;
|
||||
})()}
|
||||
icon={<Flame className="h-3 w-3" />}
|
||||
/>
|
||||
</div>
|
||||
|
|
@ -374,8 +439,10 @@ export function RuntimeUsageSection({
|
|||
<th className="px-3 py-2 text-right font-semibold text-muted">Tokens out</th>
|
||||
<th className="px-3 py-2 text-right font-semibold text-muted">Cost</th>
|
||||
<th className="px-3 py-2 text-right font-semibold text-muted">Calls</th>
|
||||
{usage.tokenLimit !== null && (
|
||||
<th className="px-3 py-2 text-right font-semibold text-muted">% limit</th>
|
||||
{(usage.outputTokenLimit !== null || usage.tokenLimit !== null) && (
|
||||
<th className="px-3 py-2 text-right font-semibold text-muted">
|
||||
{usage.outputTokenLimit !== null ? "% out limit" : "% limit"}
|
||||
</th>
|
||||
)}
|
||||
</tr>
|
||||
</thead>
|
||||
|
|
@ -396,7 +463,12 @@ export function RuntimeUsageSection({
|
|||
{entry.unpriced ? <span className="text-[color:var(--warning)]">—*</span> : fmtCost(entry.cost_usd)}
|
||||
</td>
|
||||
<td className="px-3 py-2 text-right tabular-nums text-muted">{entry.calls}</td>
|
||||
{usage.tokenLimit !== null && (
|
||||
{usage.outputTokenLimit !== null && (
|
||||
<td className="px-3 py-2 text-right tabular-nums text-muted">
|
||||
{Math.round(entry.output_tokens * 100 / usage.outputTokenLimit)}%
|
||||
</td>
|
||||
)}
|
||||
{usage.outputTokenLimit === null && usage.tokenLimit !== null && (
|
||||
<td className="px-3 py-2 text-right tabular-nums text-muted">
|
||||
{Math.round(entry.total_tokens * 100 / usage.tokenLimit)}%
|
||||
</td>
|
||||
|
|
|
|||
Loading…
Reference in New Issue