diff --git a/backend/app/api/provider_credentials.py b/backend/app/api/provider_credentials.py index bbdb7d5..8eee9a6 100644 --- a/backend/app/api/provider_credentials.py +++ b/backend/app/api/provider_credentials.py @@ -167,6 +167,7 @@ async def test_provider_credential( error=live.error, tokens=_tok(live.tokens), input_tokens=_tok(live.input_tokens), + output_tokens=_tok(live.output_tokens), requests=_req(live.requests), models=live.models, sample_model=live.sample_model, @@ -271,6 +272,7 @@ async def get_provider_usage_live( error=live.error, tokens=_tok(live.tokens), input_tokens=_tok(live.input_tokens), + output_tokens=_tok(live.output_tokens), requests=_req(live.requests), models=live.models, sample_model=live.sample_model, diff --git a/backend/app/schemas/provider_credentials.py b/backend/app/schemas/provider_credentials.py index e35e49d..8980c6b 100644 --- a/backend/app/schemas/provider_credentials.py +++ b/backend/app/schemas/provider_credentials.py @@ -60,7 +60,8 @@ class ProviderUsageLiveRead(SQLModel): confidence: str # high | medium | low error: str | None = None tokens: TokenWindowRead - input_tokens: TokenWindowRead # Anthropic splits input tokens separately + input_tokens: TokenWindowRead # Anthropic input-only window + output_tokens: TokenWindowRead # Anthropic output-only window requests: RequestWindowRead models: list[str] = [] sample_model: str | None = None diff --git a/backend/app/services/provider_usage.py b/backend/app/services/provider_usage.py index 683e86f..c9f911c 100644 --- a/backend/app/services/provider_usage.py +++ b/backend/app/services/provider_usage.py @@ -110,7 +110,8 @@ class ProviderUsageLive: confidence: str = "high" error: str | None = None tokens: TokenWindow = field(default_factory=TokenWindow) - input_tokens: TokenWindow = field(default_factory=TokenWindow) # Anthropic splits input/output + input_tokens: TokenWindow = field(default_factory=TokenWindow) # Anthropic input-only window + output_tokens: TokenWindow = field(default_factory=TokenWindow) # Anthropic output-only window requests: RequestWindow = field(default_factory=RequestWindow) models: list[str] = field(default_factory=list) # model IDs available on this key raw_headers: dict[str, str] = field(default_factory=dict) @@ -143,9 +144,10 @@ class ProviderUsageLive: "source": self.source, "confidence": self.confidence, "error": self.error, - "tokens": _window(self.tokens), - "input_tokens": _window(self.input_tokens), - "requests": _window(self.requests), + "tokens": _window(self.tokens), + "input_tokens": _window(self.input_tokens), + "output_tokens": _window(self.output_tokens), + "requests": _window(self.requests), "models": self.models[:20], # cap for response size "sample_model": self.sample_model, "sample_input_tokens": self.sample_input_tokens, @@ -201,6 +203,11 @@ def _apply_anthropic_ratelimit_headers(result: ProviderUsageLive, headers: dict[ remaining=_parse_int_header(headers, "anthropic-ratelimit-input-tokens-remaining"), reset_at=_parse_iso_reset(headers.get("anthropic-ratelimit-input-tokens-reset", "")), ) + result.output_tokens = TokenWindow( + limit=_parse_int_header(headers, "anthropic-ratelimit-output-tokens-limit"), + remaining=_parse_int_header(headers, "anthropic-ratelimit-output-tokens-remaining"), + reset_at=_parse_iso_reset(headers.get("anthropic-ratelimit-output-tokens-reset", "")), + ) result.requests = RequestWindow( limit=_parse_int_header(headers, "anthropic-ratelimit-requests-limit"), remaining=_parse_int_header(headers, "anthropic-ratelimit-requests-remaining"), @@ -463,13 +470,14 @@ async def _fetch_openai(api_key: str, base_url: str | None) -> ProviderUsageLive probe_model = _pick_openai_probe_model(result.models) if probe_model: result.sample_model = probe_model + # min 16 tokens: gpt-5.x models reject max_output_tokens < 16 probe_endpoints: list[tuple[str, dict[str, Any]]] = [ ( f"{base}/v1/responses", { "model": probe_model, "input": "Usage probe", - "max_output_tokens": 1, + "max_output_tokens": 16, }, ), ( @@ -477,7 +485,7 @@ async def _fetch_openai(api_key: str, base_url: str | None) -> ProviderUsageLive { "model": probe_model, "messages": [{"role": "user", "content": "Usage probe"}], - "max_tokens": 1, + "max_tokens": 16, }, ), ] @@ -494,6 +502,18 @@ async def _fetch_openai(api_key: str, base_url: str | None) -> ProviderUsageLive ) except Exception: continue + # Quota exhaustion is a 429 with a distinct error code — surface it + # clearly rather than treating it as a transient rate limit. + if probe_resp.status_code == 429: + try: + err = probe_resp.json().get("error", {}) + if err.get("code") == "insufficient_quota" or "exceeded your current quota" in str(err.get("message", "")): + result.error = "Quota exhausted — add credits at platform.openai.com/billing." + elapsed_ms = probe_resp.elapsed.total_seconds() * 1000.0 + result.sample_latency_ms = int(max(0.0, round(elapsed_ms))) + break + except Exception: + pass probe_headers = {k.lower(): v for k, v in probe_resp.headers.items()} probe_rl_headers = {k: v for k, v in probe_headers.items() if "ratelimit" in k} if probe_rl_headers: