feat(usage): separate limit types — typed limits for output tokens, total tokens, messages (#39)
- Add typed limit fields to RuntimeUsageCurrent: output_token_limit, total_token_limit, message_limit with matching pct and source
- Add total_output_tokens and output_tokens_per_minute to burn rate
- _build_current() now computes each pct from matching units only
- Legacy token_limit backfilled from typed limits for backwards compat
- Frontend aggregateRuntimeUsage() tracks typed limits separately
- limit_kind field on predictions indicates which limit drove time-to-limit
This commit is contained in:
parent
02eb03d408
commit
5217a70c9f
|
|
@ -25,21 +25,44 @@ class RuntimeUsageCurrent(SQLModel):
|
||||||
"""Aggregated totals within the current window."""
|
"""Aggregated totals within the current window."""
|
||||||
|
|
||||||
total_cost_usd: float
|
total_cost_usd: float
|
||||||
total_tokens: int # input + output across all sessions
|
total_tokens: int # input + output across all sessions
|
||||||
|
total_output_tokens: int = 0 # output tokens only — used with output_token_limit
|
||||||
total_calls: int
|
total_calls: int
|
||||||
token_limit: int | None = None # configured limit; None = unknown
|
|
||||||
token_pct: int | None = None # 0–100; None when limit unknown
|
# ── Legacy fields (kept for backwards compat) ────────────────────────────
|
||||||
|
# token_limit is ambiguous (could be total or output); use typed fields below
|
||||||
|
# when the limit kind is known.
|
||||||
|
token_limit: int | None = None
|
||||||
|
token_pct: int | None = None
|
||||||
cost_limit_usd: float | None = None
|
cost_limit_usd: float | None = None
|
||||||
cost_pct: int | None = None
|
cost_pct: int | None = None
|
||||||
# Source and confidence for the limits
|
|
||||||
token_limit_source: str | None = None
|
token_limit_source: str | None = None
|
||||||
cost_limit_source: str | None = None
|
cost_limit_source: str | None = None
|
||||||
|
|
||||||
|
# ── Typed limits (Phase 4) ────────────────────────────────────────────────
|
||||||
|
# Each field pairs a limit with a percent computed from matching units only.
|
||||||
|
|
||||||
|
# Output-token limit: compared against output tokens only, never input/cache.
|
||||||
|
output_token_limit: int | None = None
|
||||||
|
output_token_limit_pct: int | None = None
|
||||||
|
output_token_limit_source: str | None = None
|
||||||
|
|
||||||
|
# Total-token limit: compared against input + output combined.
|
||||||
|
total_token_limit: int | None = None
|
||||||
|
total_token_limit_pct: int | None = None
|
||||||
|
total_token_limit_source: str | None = None
|
||||||
|
|
||||||
|
# Message/request limit: compared against call count, never token totals.
|
||||||
|
message_limit: int | None = None
|
||||||
|
message_pct: int | None = None
|
||||||
|
message_limit_source: str | None = None
|
||||||
|
|
||||||
|
|
||||||
class RuntimeUsageBurnRate(SQLModel):
|
class RuntimeUsageBurnRate(SQLModel):
|
||||||
"""Recent token and cost velocity (last 60 minutes of the window)."""
|
"""Recent token and cost velocity (last 60 minutes of the window)."""
|
||||||
|
|
||||||
tokens_per_minute: float
|
tokens_per_minute: float # input + output combined
|
||||||
|
output_tokens_per_minute: float = 0.0 # output tokens only
|
||||||
cost_usd_per_minute: float
|
cost_usd_per_minute: float
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -48,6 +71,7 @@ class RuntimeUsagePredictions(SQLModel):
|
||||||
|
|
||||||
time_to_limit_ms: int | None = None # None when limit or burn rate unknown
|
time_to_limit_ms: int | None = None # None when limit or burn rate unknown
|
||||||
safe: bool # True if time_to_limit > reset_in_ms (will reset before hitting limit)
|
safe: bool # True if time_to_limit > reset_in_ms (will reset before hitting limit)
|
||||||
|
limit_kind: str = "total_tokens" # which limit drove this prediction
|
||||||
|
|
||||||
|
|
||||||
class ModelUsageEntry(SQLModel):
|
class ModelUsageEntry(SQLModel):
|
||||||
|
|
|
||||||
|
|
@ -544,58 +544,107 @@ def _build_window(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _limit_source(status_raw: dict[str, Any]) -> str:
|
||||||
|
"""Return the appropriate source label for a limit read from gateway status."""
|
||||||
|
has_rate_limit_headers = (
|
||||||
|
status_raw.get("x_ratelimit_remaining") or
|
||||||
|
status_raw.get("x_ratelimit_limit") or
|
||||||
|
status_raw.get("anthropic_ratelimit_remaining") or
|
||||||
|
status_raw.get("anthropic_ratelimit_limit")
|
||||||
|
)
|
||||||
|
return "provider_api_rate_limit" if has_rate_limit_headers else "configured_limit"
|
||||||
|
|
||||||
|
|
||||||
|
def _pct(numerator: int | float, denominator: int | float) -> int | None:
|
||||||
|
if not denominator:
|
||||||
|
return None
|
||||||
|
return int(min(100, numerator * 100 // denominator))
|
||||||
|
|
||||||
|
|
||||||
def _build_current(
|
def _build_current(
|
||||||
per_model: dict[str, ModelUsageEntry],
|
per_model: dict[str, ModelUsageEntry],
|
||||||
status_raw: dict[str, Any],
|
status_raw: dict[str, Any],
|
||||||
account_key: str = "default",
|
account_key: str = "default",
|
||||||
) -> RuntimeUsageCurrent:
|
) -> RuntimeUsageCurrent:
|
||||||
total_cost = round(sum(e.cost_usd for e in per_model.values()), 8)
|
total_cost = round(sum(e.cost_usd for e in per_model.values()), 8)
|
||||||
total_tokens = sum(e.total_tokens for e in per_model.values())
|
total_tokens = sum(e.total_tokens for e in per_model.values())
|
||||||
total_calls = sum(e.calls for e in per_model.values())
|
total_output_tokens = sum(e.output_tokens for e in per_model.values())
|
||||||
|
total_calls = sum(e.calls for e in per_model.values())
|
||||||
|
|
||||||
# Try to get configured limits from the gateway status
|
src = _limit_source(status_raw)
|
||||||
raw_token_limit = _get_int(status_raw, "tokenLimit", "token_limit", "messageLimit", "message_limit", default=0)
|
|
||||||
token_limit = raw_token_limit or None
|
|
||||||
|
|
||||||
# Determine source for token limit
|
|
||||||
if raw_token_limit:
|
|
||||||
# Check for API rate-limit headers
|
|
||||||
has_rate_limit_headers = (
|
|
||||||
status_raw.get("x_ratelimit_remaining") or
|
|
||||||
status_raw.get("x_ratelimit_limit") or
|
|
||||||
status_raw.get("anthropic_ratelimit_remaining") or
|
|
||||||
status_raw.get("anthropic_ratelimit_limit")
|
|
||||||
)
|
|
||||||
if has_rate_limit_headers:
|
|
||||||
token_limit_source = "provider_api_rate_limit"
|
|
||||||
else:
|
|
||||||
token_limit_source = "configured_limit"
|
|
||||||
else:
|
|
||||||
token_limit_source = None
|
|
||||||
|
|
||||||
token_pct = int(min(100, total_tokens * 100 // raw_token_limit)) if raw_token_limit else None
|
|
||||||
|
|
||||||
|
# ── Explicit output-token limit ───────────────────────────────────────────
|
||||||
|
raw_output_limit = _get_int(
|
||||||
|
status_raw, "outputTokenLimit", "output_token_limit", default=0
|
||||||
|
)
|
||||||
|
output_token_limit = raw_output_limit or None
|
||||||
|
output_token_limit_pct = _pct(total_output_tokens, raw_output_limit)
|
||||||
|
output_token_limit_src = src if raw_output_limit else None
|
||||||
|
|
||||||
|
# ── Explicit total-token limit ────────────────────────────────────────────
|
||||||
|
raw_total_limit = _get_int(
|
||||||
|
status_raw, "totalTokenLimit", "total_token_limit", default=0
|
||||||
|
)
|
||||||
|
total_token_limit = raw_total_limit or None
|
||||||
|
total_token_limit_pct = _pct(total_tokens, raw_total_limit)
|
||||||
|
total_token_limit_src = src if raw_total_limit else None
|
||||||
|
|
||||||
|
# ── Message/request limit (count-based, never token-based) ───────────────
|
||||||
|
raw_message_limit = _get_int(
|
||||||
|
status_raw, "messageLimit", "message_limit", "requestLimit", "request_limit",
|
||||||
|
default=0,
|
||||||
|
)
|
||||||
|
message_limit = raw_message_limit or None
|
||||||
|
message_pct = _pct(total_calls, raw_message_limit)
|
||||||
|
message_limit_src = src if raw_message_limit else None
|
||||||
|
|
||||||
|
# ── Legacy token_limit (ambiguous kind — maps to tokenLimit only) ─────────
|
||||||
|
# Do NOT fold messageLimit into this; keep units separate.
|
||||||
|
raw_token_limit = _get_int(status_raw, "tokenLimit", "token_limit", default=0)
|
||||||
|
token_limit = raw_token_limit or None
|
||||||
|
token_pct = _pct(total_tokens, raw_token_limit)
|
||||||
|
token_limit_src = src if raw_token_limit else None
|
||||||
|
|
||||||
|
# If we got an explicit typed limit but no legacy one, backfill legacy
|
||||||
|
# so existing dashboard code still works during the transition.
|
||||||
|
if token_limit is None:
|
||||||
|
if output_token_limit is not None:
|
||||||
|
token_limit = output_token_limit
|
||||||
|
token_pct = output_token_limit_pct
|
||||||
|
token_limit_src = output_token_limit_src
|
||||||
|
elif total_token_limit is not None:
|
||||||
|
token_limit = total_token_limit
|
||||||
|
token_pct = total_token_limit_pct
|
||||||
|
token_limit_src = total_token_limit_src
|
||||||
|
|
||||||
|
# ── Cost limit ────────────────────────────────────────────────────────────
|
||||||
raw_cost_limit = _get_float(status_raw, "costLimit", "cost_limit", "costLimitUsd", default=0.0)
|
raw_cost_limit = _get_float(status_raw, "costLimit", "cost_limit", "costLimitUsd", default=0.0)
|
||||||
cost_limit = raw_cost_limit or None
|
cost_limit = raw_cost_limit or None
|
||||||
|
cost_pct = _pct(total_cost, raw_cost_limit) if raw_cost_limit else None
|
||||||
# Determine source for cost limit
|
cost_limit_src = src if raw_cost_limit else None
|
||||||
if raw_cost_limit:
|
|
||||||
cost_limit_source = "configured_limit"
|
|
||||||
else:
|
|
||||||
cost_limit_source = None
|
|
||||||
|
|
||||||
cost_pct = int(min(100, total_cost * 100 / raw_cost_limit)) if raw_cost_limit else None
|
|
||||||
|
|
||||||
return RuntimeUsageCurrent(
|
return RuntimeUsageCurrent(
|
||||||
total_cost_usd=total_cost,
|
total_cost_usd=total_cost,
|
||||||
total_tokens=total_tokens,
|
total_tokens=total_tokens,
|
||||||
|
total_output_tokens=total_output_tokens,
|
||||||
total_calls=total_calls,
|
total_calls=total_calls,
|
||||||
|
# legacy
|
||||||
token_limit=token_limit,
|
token_limit=token_limit,
|
||||||
token_pct=token_pct,
|
token_pct=token_pct,
|
||||||
cost_limit_usd=cost_limit,
|
cost_limit_usd=cost_limit,
|
||||||
cost_pct=cost_pct,
|
cost_pct=cost_pct,
|
||||||
token_limit_source=token_limit_source,
|
token_limit_source=token_limit_src,
|
||||||
cost_limit_source=cost_limit_source,
|
cost_limit_source=cost_limit_src,
|
||||||
|
# typed
|
||||||
|
output_token_limit=output_token_limit,
|
||||||
|
output_token_limit_pct=output_token_limit_pct,
|
||||||
|
output_token_limit_source=output_token_limit_src,
|
||||||
|
total_token_limit=total_token_limit,
|
||||||
|
total_token_limit_pct=total_token_limit_pct,
|
||||||
|
total_token_limit_source=total_token_limit_src,
|
||||||
|
message_limit=message_limit,
|
||||||
|
message_pct=message_pct,
|
||||||
|
message_limit_source=message_limit_src,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -604,9 +653,14 @@ def _compute_burn_rate(
|
||||||
window: RuntimeUsageWindow,
|
window: RuntimeUsageWindow,
|
||||||
now: datetime,
|
now: datetime,
|
||||||
) -> RuntimeUsageBurnRate:
|
) -> RuntimeUsageBurnRate:
|
||||||
"""Compute tokens/min and cost/min from the most recent 60 minutes of sessions."""
|
"""Compute tokens/min and cost/min from the most recent 60 minutes of sessions.
|
||||||
|
|
||||||
|
Tracks total tokens (input+output) and output tokens separately so that
|
||||||
|
predictions against output-token limits use the correct numerator.
|
||||||
|
"""
|
||||||
cutoff = now - timedelta(minutes=60)
|
cutoff = now - timedelta(minutes=60)
|
||||||
recent_tokens = 0
|
recent_tokens = 0
|
||||||
|
recent_output_tokens = 0
|
||||||
recent_cost = 0.0
|
recent_cost = 0.0
|
||||||
|
|
||||||
for session in sessions:
|
for session in sessions:
|
||||||
|
|
@ -615,15 +669,14 @@ def _compute_burn_rate(
|
||||||
if ts is None or ts < cutoff:
|
if ts is None or ts < cutoff:
|
||||||
continue
|
continue
|
||||||
tokens = _parse_session_usage(session)
|
tokens = _parse_session_usage(session)
|
||||||
recent_tokens += tokens["input"] + tokens["output"]
|
recent_tokens += tokens["input"] + tokens["output"]
|
||||||
recent_cost += _get_float(session, "cost", "cost_usd", "costUsd", default=0.0)
|
recent_output_tokens += tokens["output"]
|
||||||
|
recent_cost += _get_float(session, "cost", "cost_usd", "costUsd", default=0.0)
|
||||||
|
|
||||||
# Rate per minute over the last 60 minutes
|
|
||||||
tokens_per_minute = round(recent_tokens / 60, 4)
|
|
||||||
cost_per_minute = round(recent_cost / 60, 8)
|
|
||||||
return RuntimeUsageBurnRate(
|
return RuntimeUsageBurnRate(
|
||||||
tokens_per_minute=tokens_per_minute,
|
tokens_per_minute=round(recent_tokens / 60, 4),
|
||||||
cost_usd_per_minute=cost_per_minute,
|
output_tokens_per_minute=round(recent_output_tokens / 60, 4),
|
||||||
|
cost_usd_per_minute=round(recent_cost / 60, 8),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -632,18 +685,79 @@ def _build_predictions(
|
||||||
burn_rate: RuntimeUsageBurnRate,
|
burn_rate: RuntimeUsageBurnRate,
|
||||||
window: RuntimeUsageWindow,
|
window: RuntimeUsageWindow,
|
||||||
) -> RuntimeUsagePredictions:
|
) -> RuntimeUsagePredictions:
|
||||||
"""Estimate time-to-limit in ms based on total-token burn rate."""
|
"""Estimate time-to-limit in ms using the most constrained matching limit.
|
||||||
if burn_rate.tokens_per_minute <= 0 or current.token_limit is None:
|
|
||||||
return RuntimeUsagePredictions(time_to_limit_ms=None, safe=True)
|
|
||||||
|
|
||||||
tokens_remaining = current.token_limit - current.total_tokens
|
Candidate limits (the one with the smallest time-to-limit is reported):
|
||||||
if tokens_remaining <= 0:
|
1. output_token_limit vs output_tokens (burn: output_tokens_per_minute)
|
||||||
return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False)
|
2. total_token_limit vs total_tokens (burn: tokens_per_minute)
|
||||||
|
3. legacy token_limit vs total_tokens (burn: tokens_per_minute)
|
||||||
|
4. message_limit vs total_calls (constant rate = calls / window_minutes)
|
||||||
|
|
||||||
minutes_to_limit = tokens_remaining / burn_rate.tokens_per_minute
|
Cost limits are not used for time-to-limit since they require billing data;
|
||||||
time_to_limit_ms = int(minutes_to_limit * 60 * 1000)
|
among the limits listed above, the smallest time-to-limit is the one reported.
|
||||||
|
"""
|
||||||
|
candidates: list[tuple[int, str]] = [] # (time_to_limit_ms, kind)
|
||||||
|
|
||||||
|
# ── Output-token limit ────────────────────────────────────────────────────
|
||||||
|
if (
|
||||||
|
current.output_token_limit is not None
|
||||||
|
and burn_rate.output_tokens_per_minute > 0
|
||||||
|
):
|
||||||
|
remaining = current.output_token_limit - current.total_output_tokens
|
||||||
|
if remaining <= 0:
|
||||||
|
return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False, limit_kind="output_tokens")
|
||||||
|
candidates.append((
|
||||||
|
int(remaining / burn_rate.output_tokens_per_minute * 60_000),
|
||||||
|
"output_tokens",
|
||||||
|
))
|
||||||
|
|
||||||
|
# ── Total-token limit ─────────────────────────────────────────────────────
|
||||||
|
if (
|
||||||
|
current.total_token_limit is not None
|
||||||
|
and burn_rate.tokens_per_minute > 0
|
||||||
|
):
|
||||||
|
remaining = current.total_token_limit - current.total_tokens
|
||||||
|
if remaining <= 0:
|
||||||
|
return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False, limit_kind="total_tokens")
|
||||||
|
candidates.append((
|
||||||
|
int(remaining / burn_rate.tokens_per_minute * 60_000),
|
||||||
|
"total_tokens",
|
||||||
|
))
|
||||||
|
|
||||||
|
# ── Legacy token_limit (only when no typed token limit) ───────────────────
|
||||||
|
if (
|
||||||
|
not candidates
|
||||||
|
and current.token_limit is not None
|
||||||
|
and burn_rate.tokens_per_minute > 0
|
||||||
|
):
|
||||||
|
remaining = current.token_limit - current.total_tokens
|
||||||
|
if remaining <= 0:
|
||||||
|
return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False, limit_kind="total_tokens")
|
||||||
|
candidates.append((
|
||||||
|
int(remaining / burn_rate.tokens_per_minute * 60_000),
|
||||||
|
"total_tokens",
|
||||||
|
))
|
||||||
|
|
||||||
|
# ── Message limit ─────────────────────────────────────────────────────────
|
||||||
|
if current.message_limit is not None and current.message_limit > 0:
|
||||||
|
window_minutes = max(window.reset_in_ms / 60_000, 1)
|
||||||
|
calls_per_minute = current.total_calls / window_minutes if window_minutes > 0 else 0
|
||||||
|
if calls_per_minute > 0:
|
||||||
|
remaining = current.message_limit - current.total_calls
|
||||||
|
if remaining <= 0:
|
||||||
|
return RuntimeUsagePredictions(time_to_limit_ms=0, safe=False, limit_kind="messages")
|
||||||
|
candidates.append((
|
||||||
|
int(remaining / calls_per_minute * 60_000),
|
||||||
|
"messages",
|
||||||
|
))
|
||||||
|
|
||||||
|
if not candidates:
|
||||||
|
return RuntimeUsagePredictions(time_to_limit_ms=None, safe=True, limit_kind="total_tokens")
|
||||||
|
|
||||||
|
# Pick the most constrained (smallest time) — that is what will actually block work.
|
||||||
|
time_to_limit_ms, kind = min(candidates, key=lambda c: c[0])
|
||||||
safe = time_to_limit_ms > window.reset_in_ms
|
safe = time_to_limit_ms > window.reset_in_ms
|
||||||
return RuntimeUsagePredictions(time_to_limit_ms=time_to_limit_ms, safe=safe)
|
return RuntimeUsagePredictions(time_to_limit_ms=time_to_limit_ms, safe=safe, limit_kind=kind)
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
|
||||||
|
|
@ -16,15 +16,24 @@ import { DashboardEmptyState } from "./DashboardEmptyState";
|
||||||
export interface AggregatedRuntimeUsage {
|
export interface AggregatedRuntimeUsage {
|
||||||
totalCostUsd: number;
|
totalCostUsd: number;
|
||||||
totalTokens: number;
|
totalTokens: number;
|
||||||
|
totalOutputTokens: number;
|
||||||
totalCalls: number;
|
totalCalls: number;
|
||||||
|
// Legacy (backwards compat)
|
||||||
tokenLimit: number | null;
|
tokenLimit: number | null;
|
||||||
tokenPct: number | null;
|
tokenPct: number | null;
|
||||||
costLimitUsd: number | null;
|
costLimitUsd: number | null;
|
||||||
|
// Typed limits (Phase 4)
|
||||||
|
outputTokenLimit: number | null;
|
||||||
|
outputTokenLimitPct: number | null;
|
||||||
|
messageLimit: number | null;
|
||||||
|
messagePct: number | null;
|
||||||
|
limitKind: string; // which limit drove the time-to-limit prediction
|
||||||
resetInMs: number;
|
resetInMs: number;
|
||||||
resetsAt: string;
|
resetsAt: string;
|
||||||
timeToLimitMs: number | null;
|
timeToLimitMs: number | null;
|
||||||
safe: boolean;
|
safe: boolean;
|
||||||
tokensPerMinute: number;
|
tokensPerMinute: number;
|
||||||
|
outputTokensPerMinute: number;
|
||||||
costUsdPerMinute: number;
|
costUsdPerMinute: number;
|
||||||
perModel: Record<string, ModelUsageEntry>;
|
perModel: Record<string, ModelUsageEntry>;
|
||||||
topSessions: TopSession[];
|
topSessions: TopSession[];
|
||||||
|
|
@ -54,15 +63,22 @@ export function aggregateRuntimeUsage(
|
||||||
return {
|
return {
|
||||||
totalCostUsd: 0,
|
totalCostUsd: 0,
|
||||||
totalTokens: 0,
|
totalTokens: 0,
|
||||||
|
totalOutputTokens: 0,
|
||||||
totalCalls: 0,
|
totalCalls: 0,
|
||||||
tokenLimit: null,
|
tokenLimit: null,
|
||||||
tokenPct: null,
|
tokenPct: null,
|
||||||
costLimitUsd: null,
|
costLimitUsd: null,
|
||||||
|
outputTokenLimit: null,
|
||||||
|
outputTokenLimitPct: null,
|
||||||
|
messageLimit: null,
|
||||||
|
messagePct: null,
|
||||||
|
limitKind: "total_tokens",
|
||||||
resetInMs: 0,
|
resetInMs: 0,
|
||||||
resetsAt: "",
|
resetsAt: "",
|
||||||
timeToLimitMs: null,
|
timeToLimitMs: null,
|
||||||
safe: true,
|
safe: true,
|
||||||
tokensPerMinute: 0,
|
tokensPerMinute: 0,
|
||||||
|
outputTokensPerMinute: 0,
|
||||||
costUsdPerMinute: 0,
|
costUsdPerMinute: 0,
|
||||||
perModel: {},
|
perModel: {},
|
||||||
topSessions: [],
|
topSessions: [],
|
||||||
|
|
@ -71,21 +87,24 @@ export function aggregateRuntimeUsage(
|
||||||
|
|
||||||
let totalCostUsd = 0;
|
let totalCostUsd = 0;
|
||||||
let totalTokens = 0;
|
let totalTokens = 0;
|
||||||
|
let totalOutputTokens = 0;
|
||||||
let totalCalls = 0;
|
let totalCalls = 0;
|
||||||
let tokensPerMinute = 0;
|
let tokensPerMinute = 0;
|
||||||
|
let outputTokensPerMinute = 0;
|
||||||
let costUsdPerMinute = 0;
|
let costUsdPerMinute = 0;
|
||||||
// Use the window that resets soonest as the binding constraint
|
|
||||||
let resetInMs = valid[0].window.reset_in_ms;
|
let resetInMs = valid[0].window.reset_in_ms;
|
||||||
let resetsAt = valid[0].window.resets_at;
|
let resetsAt = valid[0].window.resets_at;
|
||||||
const perModel: Record<string, ModelUsageEntry> = {};
|
const perModel: Record<string, ModelUsageEntry> = {};
|
||||||
const allSessions: TopSession[] = [];
|
const allSessions: TopSession[] = [];
|
||||||
|
|
||||||
for (const r of valid) {
|
for (const r of valid) {
|
||||||
totalCostUsd += r.current.total_cost_usd;
|
totalCostUsd += r.current.total_cost_usd;
|
||||||
totalTokens += r.current.total_tokens;
|
totalTokens += r.current.total_tokens;
|
||||||
totalCalls += r.current.total_calls;
|
totalOutputTokens += r.current.total_output_tokens ?? 0;
|
||||||
tokensPerMinute += r.burn_rate.tokens_per_minute;
|
totalCalls += r.current.total_calls;
|
||||||
costUsdPerMinute += r.burn_rate.cost_usd_per_minute;
|
tokensPerMinute += r.burn_rate.tokens_per_minute;
|
||||||
|
outputTokensPerMinute += r.burn_rate.output_tokens_per_minute ?? 0;
|
||||||
|
costUsdPerMinute += r.burn_rate.cost_usd_per_minute;
|
||||||
|
|
||||||
if (r.window.reset_in_ms < resetInMs) {
|
if (r.window.reset_in_ms < resetInMs) {
|
||||||
resetInMs = r.window.reset_in_ms;
|
resetInMs = r.window.reset_in_ms;
|
||||||
|
|
@ -113,25 +132,45 @@ export function aggregateRuntimeUsage(
|
||||||
allSessions.push(...r.top_sessions);
|
allSessions.push(...r.top_sessions);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Aggregate token limits — only meaningful if all gateways share one limit
|
// Legacy token limit
|
||||||
const limits = valid.map(r => r.current.token_limit).filter((v): v is number => v !== null);
|
const limits = valid.map(r => r.current.token_limit).filter((v): v is number => v !== null);
|
||||||
const tokenLimit = limits.length > 0 ? limits.reduce((a, b) => a + b, 0) : null;
|
const tokenLimit = limits.length > 0 ? limits.reduce((a, b) => a + b, 0) : null;
|
||||||
const tokenPct = tokenLimit ? Math.min(100, Math.round(totalTokens * 100 / tokenLimit)) : null;
|
const tokenPct = tokenLimit ? Math.min(100, Math.round(totalTokens * 100 / tokenLimit)) : null;
|
||||||
const costLimits = valid.map(r => r.current.cost_limit_usd).filter((v): v is number => v !== null);
|
const costLimits = valid.map(r => r.current.cost_limit_usd).filter((v): v is number => v !== null);
|
||||||
const costLimitUsd = costLimits.length > 0 ? costLimits.reduce((a, b) => a + b, 0) : null;
|
const costLimitUsd = costLimits.length > 0 ? costLimits.reduce((a, b) => a + b, 0) : null;
|
||||||
|
|
||||||
// Re-derive time-to-limit from aggregated burn rate
|
// Typed limits
|
||||||
|
const outLimits = valid.map(r => r.current.output_token_limit).filter((v): v is number => v !== null);
|
||||||
|
const outputTokenLimit = outLimits.length > 0 ? outLimits.reduce((a, b) => a + b, 0) : null;
|
||||||
|
const outputTokenLimitPct = outputTokenLimit
|
||||||
|
? Math.min(100, Math.round(totalOutputTokens * 100 / outputTokenLimit))
|
||||||
|
: null;
|
||||||
|
|
||||||
|
const msgLimits = valid.map(r => r.current.message_limit).filter((v): v is number => v !== null);
|
||||||
|
const messageLimit = msgLimits.length > 0 ? msgLimits.reduce((a, b) => a + b, 0) : null;
|
||||||
|
const messagePct = messageLimit ? Math.min(100, Math.round(totalCalls * 100 / messageLimit)) : null;
|
||||||
|
|
||||||
|
// Re-derive time-to-limit using the most constrained matching limit
|
||||||
let timeToLimitMs: number | null = null;
|
let timeToLimitMs: number | null = null;
|
||||||
let safe = true;
|
let safe = true;
|
||||||
|
let limitKind = "total_tokens";
|
||||||
|
const candidates: Array<{ ms: number; kind: string }> = [];
|
||||||
|
|
||||||
|
if (outputTokenLimit !== null && outputTokensPerMinute > 0) {
|
||||||
|
const rem = outputTokenLimit - totalOutputTokens;
|
||||||
|
if (rem <= 0) return { totalCostUsd: Math.round(totalCostUsd * 1e8) / 1e8, totalTokens, totalOutputTokens, totalCalls, tokenLimit, tokenPct, costLimitUsd, outputTokenLimit, outputTokenLimitPct, messageLimit, messagePct, limitKind: "output_tokens", resetInMs, resetsAt, timeToLimitMs: 0, safe: false, tokensPerMinute: Math.round(tokensPerMinute * 100) / 100, outputTokensPerMinute: Math.round(outputTokensPerMinute * 100) / 100, costUsdPerMinute: Math.round(costUsdPerMinute * 1e8) / 1e8, perModel, topSessions: [...allSessions].sort((a, b) => b.cost_usd - a.cost_usd).slice(0, 10) };
|
||||||
|
candidates.push({ ms: Math.round(rem / outputTokensPerMinute * 60_000), kind: "output_tokens" });
|
||||||
|
}
|
||||||
if (tokenLimit !== null && tokensPerMinute > 0) {
|
if (tokenLimit !== null && tokensPerMinute > 0) {
|
||||||
const remaining = tokenLimit - totalTokens;
|
const rem = tokenLimit - totalTokens;
|
||||||
if (remaining <= 0) {
|
if (rem <= 0) return { totalCostUsd: Math.round(totalCostUsd * 1e8) / 1e8, totalTokens, totalOutputTokens, totalCalls, tokenLimit, tokenPct, costLimitUsd, outputTokenLimit, outputTokenLimitPct, messageLimit, messagePct, limitKind: "total_tokens", resetInMs, resetsAt, timeToLimitMs: 0, safe: false, tokensPerMinute: Math.round(tokensPerMinute * 100) / 100, outputTokensPerMinute: Math.round(outputTokensPerMinute * 100) / 100, costUsdPerMinute: Math.round(costUsdPerMinute * 1e8) / 1e8, perModel, topSessions: [...allSessions].sort((a, b) => b.cost_usd - a.cost_usd).slice(0, 10) };
|
||||||
timeToLimitMs = 0;
|
candidates.push({ ms: Math.round(rem / tokensPerMinute * 60_000), kind: "total_tokens" });
|
||||||
safe = false;
|
}
|
||||||
} else {
|
if (candidates.length > 0) {
|
||||||
timeToLimitMs = Math.round((remaining / tokensPerMinute) * 60 * 1000);
|
const tightest = candidates.reduce((a, b) => a.ms < b.ms ? a : b);
|
||||||
safe = timeToLimitMs > resetInMs;
|
timeToLimitMs = tightest.ms;
|
||||||
}
|
limitKind = tightest.kind;
|
||||||
|
safe = timeToLimitMs > resetInMs;
|
||||||
}
|
}
|
||||||
|
|
||||||
const topSessions = [...allSessions]
|
const topSessions = [...allSessions]
|
||||||
|
|
@ -141,15 +180,22 @@ export function aggregateRuntimeUsage(
|
||||||
return {
|
return {
|
||||||
totalCostUsd: Math.round(totalCostUsd * 1e8) / 1e8,
|
totalCostUsd: Math.round(totalCostUsd * 1e8) / 1e8,
|
||||||
totalTokens,
|
totalTokens,
|
||||||
|
totalOutputTokens,
|
||||||
totalCalls,
|
totalCalls,
|
||||||
tokenLimit,
|
tokenLimit,
|
||||||
tokenPct,
|
tokenPct,
|
||||||
costLimitUsd,
|
costLimitUsd,
|
||||||
|
outputTokenLimit,
|
||||||
|
outputTokenLimitPct,
|
||||||
|
messageLimit,
|
||||||
|
messagePct,
|
||||||
|
limitKind,
|
||||||
resetInMs,
|
resetInMs,
|
||||||
resetsAt,
|
resetsAt,
|
||||||
timeToLimitMs,
|
timeToLimitMs,
|
||||||
safe,
|
safe,
|
||||||
tokensPerMinute: Math.round(tokensPerMinute * 100) / 100,
|
tokensPerMinute: Math.round(tokensPerMinute * 100) / 100,
|
||||||
|
outputTokensPerMinute: Math.round(outputTokensPerMinute * 100) / 100,
|
||||||
costUsdPerMinute: Math.round(costUsdPerMinute * 1e8) / 1e8,
|
costUsdPerMinute: Math.round(costUsdPerMinute * 1e8) / 1e8,
|
||||||
perModel,
|
perModel,
|
||||||
topSessions,
|
topSessions,
|
||||||
|
|
@ -343,14 +389,33 @@ export function RuntimeUsageSection({
|
||||||
<StatCard
|
<StatCard
|
||||||
label="Time to Limit"
|
label="Time to Limit"
|
||||||
value={usage.timeToLimitMs === null ? "—" : usage.timeToLimitMs === 0 ? "At limit" : fmtMs(usage.timeToLimitMs)}
|
value={usage.timeToLimitMs === null ? "—" : usage.timeToLimitMs === 0 ? "At limit" : fmtMs(usage.timeToLimitMs)}
|
||||||
sub={usage.tokenLimit ? `${usage.tokenPct ?? 0}% of ${fmtTokens(usage.tokenLimit)}` : undefined}
|
sub={(() => {
|
||||||
|
if (usage.outputTokenLimit) {
|
||||||
|
return `${usage.outputTokenLimitPct ?? 0}% of ${fmtTokens(usage.outputTokenLimit)} out`;
|
||||||
|
}
|
||||||
|
if (usage.tokenLimit) {
|
||||||
|
return `${usage.tokenPct ?? 0}% of ${fmtTokens(usage.tokenLimit)}`;
|
||||||
|
}
|
||||||
|
if (usage.messageLimit) {
|
||||||
|
return `${usage.messagePct ?? 0}% of ${usage.messageLimit} msgs`;
|
||||||
|
}
|
||||||
|
return undefined;
|
||||||
|
})()}
|
||||||
tone={safenessTone}
|
tone={safenessTone}
|
||||||
icon={<TrendingDown className="h-3 w-3" />}
|
icon={<TrendingDown className="h-3 w-3" />}
|
||||||
/>
|
/>
|
||||||
<StatCard
|
<StatCard
|
||||||
label="Burn Rate"
|
label="Burn Rate"
|
||||||
value={usage.costUsdPerMinute > 0 ? `${fmtCost(usage.costUsdPerMinute)}/m` : "—"}
|
value={usage.costUsdPerMinute > 0 ? `${fmtCost(usage.costUsdPerMinute)}/m` : "—"}
|
||||||
sub={usage.tokensPerMinute > 0 ? `${fmtTokens(usage.tokensPerMinute)} tok/m` : undefined}
|
sub={(() => {
|
||||||
|
if (usage.outputTokensPerMinute > 0 && usage.outputTokenLimit) {
|
||||||
|
return `${fmtTokens(usage.outputTokensPerMinute)} out-tok/m`;
|
||||||
|
}
|
||||||
|
if (usage.tokensPerMinute > 0) {
|
||||||
|
return `${fmtTokens(usage.tokensPerMinute)} tok/m`;
|
||||||
|
}
|
||||||
|
return undefined;
|
||||||
|
})()}
|
||||||
icon={<Flame className="h-3 w-3" />}
|
icon={<Flame className="h-3 w-3" />}
|
||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
|
|
@ -374,8 +439,10 @@ export function RuntimeUsageSection({
|
||||||
<th className="px-3 py-2 text-right font-semibold text-muted">Tokens out</th>
|
<th className="px-3 py-2 text-right font-semibold text-muted">Tokens out</th>
|
||||||
<th className="px-3 py-2 text-right font-semibold text-muted">Cost</th>
|
<th className="px-3 py-2 text-right font-semibold text-muted">Cost</th>
|
||||||
<th className="px-3 py-2 text-right font-semibold text-muted">Calls</th>
|
<th className="px-3 py-2 text-right font-semibold text-muted">Calls</th>
|
||||||
{usage.tokenLimit !== null && (
|
{(usage.outputTokenLimit !== null || usage.tokenLimit !== null) && (
|
||||||
<th className="px-3 py-2 text-right font-semibold text-muted">% limit</th>
|
<th className="px-3 py-2 text-right font-semibold text-muted">
|
||||||
|
{usage.outputTokenLimit !== null ? "% out limit" : "% limit"}
|
||||||
|
</th>
|
||||||
)}
|
)}
|
||||||
</tr>
|
</tr>
|
||||||
</thead>
|
</thead>
|
||||||
|
|
@ -396,7 +463,12 @@ export function RuntimeUsageSection({
|
||||||
{entry.unpriced ? <span className="text-[color:var(--warning)]">—*</span> : fmtCost(entry.cost_usd)}
|
{entry.unpriced ? <span className="text-[color:var(--warning)]">—*</span> : fmtCost(entry.cost_usd)}
|
||||||
</td>
|
</td>
|
||||||
<td className="px-3 py-2 text-right tabular-nums text-muted">{entry.calls}</td>
|
<td className="px-3 py-2 text-right tabular-nums text-muted">{entry.calls}</td>
|
||||||
{usage.tokenLimit !== null && (
|
{usage.outputTokenLimit !== null && (
|
||||||
|
<td className="px-3 py-2 text-right tabular-nums text-muted">
|
||||||
|
{Math.round(entry.output_tokens * 100 / usage.outputTokenLimit)}%
|
||||||
|
</td>
|
||||||
|
)}
|
||||||
|
{usage.outputTokenLimit === null && usage.tokenLimit !== null && (
|
||||||
<td className="px-3 py-2 text-right tabular-nums text-muted">
|
<td className="px-3 py-2 text-right tabular-nums text-muted">
|
||||||
{Math.round(entry.total_tokens * 100 / usage.tokenLimit)}%
|
{Math.round(entry.total_tokens * 100 / usage.tokenLimit)}%
|
||||||
</td>
|
</td>
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue