diff --git a/backend/app/api/provider_credentials.py b/backend/app/api/provider_credentials.py index 2734639..1a4db21 100644 --- a/backend/app/api/provider_credentials.py +++ b/backend/app/api/provider_credentials.py @@ -167,6 +167,10 @@ async def test_provider_credential( input_tokens=_tok(live.input_tokens), requests=_req(live.requests), models=live.models, + sample_model=live.sample_model, + sample_input_tokens=live.sample_input_tokens, + sample_output_tokens=live.sample_output_tokens, + sample_latency_ms=live.sample_latency_ms, debug_rate_limit_headers=sorted(live.raw_headers.keys()) if live.raw_headers else None, ) @@ -265,6 +269,10 @@ async def get_provider_usage_live( input_tokens=_tok(live.input_tokens), requests=_req(live.requests), models=live.models, + sample_model=live.sample_model, + sample_input_tokens=live.sample_input_tokens, + sample_output_tokens=live.sample_output_tokens, + sample_latency_ms=live.sample_latency_ms, debug_rate_limit_headers=sorted(live.raw_headers.keys()) if live.raw_headers else None, ) diff --git a/backend/app/models/gateways.py b/backend/app/models/gateways.py index 3451a10..68e6776 100644 --- a/backend/app/models/gateways.py +++ b/backend/app/models/gateways.py @@ -26,5 +26,6 @@ class Gateway(QueryModel, table=True): disable_device_pairing: bool = Field(default=False) workspace_root: str allow_insecure_tls: bool = Field(default=False) + main_session_key: str | None = Field(default=None) created_at: datetime = Field(default_factory=utcnow) updated_at: datetime = Field(default_factory=utcnow) diff --git a/backend/app/schemas/provider_credentials.py b/backend/app/schemas/provider_credentials.py index 46dba5b..25a40d4 100644 --- a/backend/app/schemas/provider_credentials.py +++ b/backend/app/schemas/provider_credentials.py @@ -61,6 +61,10 @@ class ProviderUsageLiveRead(SQLModel): input_tokens: TokenWindowRead # Anthropic splits input tokens separately requests: RequestWindowRead models: list[str] = [] + sample_model: str | None = 
None + sample_input_tokens: int | None = None + sample_output_tokens: int | None = None + sample_latency_ms: int | None = None # Optional debugging aid: exact rate-limit header names returned by provider. debug_rate_limit_headers: list[str] | None = None diff --git a/backend/app/services/provider_usage.py b/backend/app/services/provider_usage.py index 6dcae63..d2ce6c2 100644 --- a/backend/app/services/provider_usage.py +++ b/backend/app/services/provider_usage.py @@ -13,14 +13,20 @@ anthropic → GET https://api.anthropic.com/v1/models Headers: anthropic-ratelimit-tokens-limit/remaining/reset anthropic-ratelimit-requests-limit/remaining/reset anthropic-ratelimit-input-tokens-limit/remaining/reset + Fallback probe (only when headers missing): + POST /v1/messages with max_tokens=1 to surface usage+time data. openai → GET https://api.openai.com/v1/models (codex) Headers: x-ratelimit-limit-tokens, x-ratelimit-remaining-tokens, x-ratelimit-reset-tokens, x-ratelimit-limit-requests, x-ratelimit-remaining-requests, x-ratelimit-reset-requests + Fallback probe (only when headers missing): + POST /v1/chat/completions with max_tokens=1 to surface usage+time. ollama → GET {base_url}/api/tags (health-check only; no rate limits) Returns: model list, server reachable flag + Sample probe (whenever at least one model is listed): + POST {base_url}/api/generate with num_predict=1 for usage+time. 
Caching ------- @@ -102,16 +108,24 @@ class ProviderUsageLive: requests: RequestWindow = field(default_factory=RequestWindow) models: list[str] = field(default_factory=list) # model IDs available on this key raw_headers: dict[str, str] = field(default_factory=dict) + sample_model: str | None = None + sample_input_tokens: int | None = None + sample_output_tokens: int | None = None + sample_latency_ms: int | None = None def to_dict(self) -> dict[str, Any]: def _window(w: TokenWindow | RequestWindow) -> dict[str, Any]: d: dict[str, Any] = {} - if hasattr(w, "limit"): d["limit"] = w.limit - if hasattr(w, "remaining"): d["remaining"] = w.remaining - if hasattr(w, "reset_in_ms"): d["reset_in_ms"] = w.reset_in_ms - if hasattr(w, "reset_at"): d["reset_at"] = w.reset_at.isoformat() if w.reset_at else None + if hasattr(w, "limit"): + d["limit"] = w.limit + if hasattr(w, "remaining"): + d["remaining"] = w.remaining + if hasattr(w, "reset_in_ms"): + d["reset_in_ms"] = w.reset_in_ms + if hasattr(w, "reset_at"): + d["reset_at"] = w.reset_at.isoformat() if w.reset_at else None if isinstance(w, TokenWindow): - d["used"] = w.used + d["used"] = w.used d["pct_used"] = w.pct_used return d @@ -125,6 +139,10 @@ class ProviderUsageLive: "input_tokens": _window(self.input_tokens), "requests": _window(self.requests), "models": self.models[:20], # cap for response size + "sample_model": self.sample_model, + "sample_input_tokens": self.sample_input_tokens, + "sample_output_tokens": self.sample_output_tokens, + "sample_latency_ms": self.sample_latency_ms, } @@ -163,6 +181,49 @@ _OAI_DURATION_RE = re.compile( ) +def _apply_anthropic_ratelimit_headers(result: ProviderUsageLive, headers: dict[str, str]) -> None: + """Populate Anthropic limit windows from response headers.""" + result.tokens = TokenWindow( + limit=_parse_int_header(headers, "anthropic-ratelimit-tokens-limit"), + remaining=_parse_int_header(headers, "anthropic-ratelimit-tokens-remaining"), + 
reset_at=_parse_iso_reset(headers.get("anthropic-ratelimit-tokens-reset", "")), + ) + result.input_tokens = TokenWindow( + limit=_parse_int_header(headers, "anthropic-ratelimit-input-tokens-limit"), + remaining=_parse_int_header(headers, "anthropic-ratelimit-input-tokens-remaining"), + reset_at=_parse_iso_reset(headers.get("anthropic-ratelimit-input-tokens-reset", "")), + ) + result.requests = RequestWindow( + limit=_parse_int_header(headers, "anthropic-ratelimit-requests-limit"), + remaining=_parse_int_header(headers, "anthropic-ratelimit-requests-remaining"), + reset_at=_parse_iso_reset(headers.get("anthropic-ratelimit-requests-reset", "")), + ) + + +def _pick_anthropic_probe_model(models: list[str]) -> str | None: + if not models: + return None + priorities = ("haiku", "sonnet", "opus") + lowered = [(m, m.lower()) for m in models] + for priority in priorities: + for original, lowered_name in lowered: + if priority in lowered_name: + return original + return models[0] + + +def _pick_openai_probe_model(models: list[str]) -> str | None: + if not models: + return None + priorities = ("gpt-4.1-mini", "gpt-4o-mini", "gpt-4.1", "gpt-4o", "o4-mini") + lowered = [(m, m.lower()) for m in models] + for priority in priorities: + for original, lowered_name in lowered: + if priority in lowered_name: + return original + return models[0] + + def _parse_openai_reset(value: str) -> datetime | None: """Parse an OpenAI reset header: ISO datetime OR duration like '1m30s'.""" if not value: @@ -219,23 +280,7 @@ async def _fetch_anthropic(api_key: str, base_url: str | None) -> ProviderUsageL result.reachable = True result.raw_headers = {k: v for k, v in h.items() if "ratelimit" in k} - # Token window (combined input+output) - result.tokens = TokenWindow( - limit = _parse_int_header(h, "anthropic-ratelimit-tokens-limit"), - remaining = _parse_int_header(h, "anthropic-ratelimit-tokens-remaining"), - reset_at = _parse_iso_reset(h.get("anthropic-ratelimit-tokens-reset", "")), - ) - # 
Input-token window (separate limit for input) - result.input_tokens = TokenWindow( - limit = _parse_int_header(h, "anthropic-ratelimit-input-tokens-limit"), - remaining = _parse_int_header(h, "anthropic-ratelimit-input-tokens-remaining"), - reset_at = _parse_iso_reset(h.get("anthropic-ratelimit-input-tokens-reset", "")), - ) - result.requests = RequestWindow( - limit = _parse_int_header(h, "anthropic-ratelimit-requests-limit"), - remaining = _parse_int_header(h, "anthropic-ratelimit-requests-remaining"), - reset_at = _parse_iso_reset(h.get("anthropic-ratelimit-requests-reset", "")), - ) + _apply_anthropic_ratelimit_headers(result, h) # Extract model IDs try: @@ -245,6 +290,56 @@ async def _fetch_anthropic(api_key: str, base_url: str | None) -> ProviderUsageL except Exception: pass + # Some tiers/paths may omit ratelimit headers on /v1/models. + # Fall back to a minimal /v1/messages probe so we can still surface usage/time. + if ( + result.tokens.limit is None + and result.input_tokens.limit is None + and result.requests.limit is None + ): + probe_model = _pick_anthropic_probe_model(result.models) + if probe_model: + result.sample_model = probe_model + async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client: + try: + probe_resp = await client.post( + f"{base}/v1/messages", + headers={ + "x-api-key": api_key, + "anthropic-version": "2023-06-01", + "content-type": "application/json", + }, + json={ + "model": probe_model, + "max_tokens": 1, + "messages": [{"role": "user", "content": "Usage probe"}], + }, + ) + except Exception: + probe_resp = None + + if probe_resp is not None: + probe_headers = {k.lower(): v for k, v in probe_resp.headers.items()} + probe_rl_headers = {k: v for k, v in probe_headers.items() if "ratelimit" in k} + if probe_rl_headers: + result.raw_headers = probe_rl_headers + _apply_anthropic_ratelimit_headers(result, probe_headers) + if probe_resp.status_code == 200: + try: + payload = probe_resp.json() + usage = payload.get("usage") if 
isinstance(payload, dict) else None + if isinstance(usage, dict): + in_tok = usage.get("input_tokens") + out_tok = usage.get("output_tokens") + if isinstance(in_tok, int): + result.sample_input_tokens = in_tok + if isinstance(out_tok, int): + result.sample_output_tokens = out_tok + except Exception: + pass + elapsed_ms = probe_resp.elapsed.total_seconds() * 1000.0 + result.sample_latency_ms = int(max(0.0, round(elapsed_ms))) + return result @@ -296,6 +391,63 @@ async def _fetch_openai(api_key: str, base_url: str | None) -> ProviderUsageLive except Exception: pass + if result.tokens.limit is None and result.requests.limit is None: + probe_model = _pick_openai_probe_model(result.models) + if probe_model: + result.sample_model = probe_model + async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client: + try: + probe_resp = await client.post( + f"{base}/v1/chat/completions", + headers={ + "Authorization": f"Bearer {api_key}", + "content-type": "application/json", + }, + json={ + "model": probe_model, + "messages": [{"role": "user", "content": "Usage probe"}], + "max_tokens": 1, + }, + ) + except Exception: + probe_resp = None + if probe_resp is not None: + probe_headers = {k.lower(): v for k, v in probe_resp.headers.items()} + probe_rl_headers = {k: v for k, v in probe_headers.items() if "ratelimit" in k} + if probe_rl_headers: + result.raw_headers = probe_rl_headers + result.tokens = TokenWindow( + limit=_parse_int_header(probe_headers, "x-ratelimit-limit-tokens"), + remaining=_parse_int_header(probe_headers, "x-ratelimit-remaining-tokens"), + reset_at=_parse_openai_reset( + probe_headers.get("x-ratelimit-reset-tokens", "") + ), + ) + result.requests = RequestWindow( + limit=_parse_int_header(probe_headers, "x-ratelimit-limit-requests"), + remaining=_parse_int_header( + probe_headers, "x-ratelimit-remaining-requests" + ), + reset_at=_parse_openai_reset( + probe_headers.get("x-ratelimit-reset-requests", "") + ), + ) + if probe_resp.status_code == 200: + try: + 
payload = probe_resp.json() + usage = payload.get("usage") if isinstance(payload, dict) else None + if isinstance(usage, dict): + in_tok = usage.get("prompt_tokens") + out_tok = usage.get("completion_tokens") + if isinstance(in_tok, int): + result.sample_input_tokens = in_tok + if isinstance(out_tok, int): + result.sample_output_tokens = out_tok + except Exception: + pass + elapsed_ms = probe_resp.elapsed.total_seconds() * 1000.0 + result.sample_latency_ms = int(max(0.0, round(elapsed_ms))) + return result @@ -331,6 +483,37 @@ async def _fetch_ollama(base_url: str | None, api_key: str | None) -> ProviderUs except Exception: pass + if result.models: + result.sample_model = result.models[0] + async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client: + try: + probe_resp = await client.post( + f"{base}/api/generate", + headers={**headers, "content-type": "application/json"}, + json={ + "model": result.sample_model, + "prompt": "Usage probe", + "stream": False, + "options": {"num_predict": 1}, + }, + ) + except Exception: + probe_resp = None + if probe_resp is not None and probe_resp.status_code == 200: + try: + payload = probe_resp.json() + in_tok = payload.get("prompt_eval_count") + out_tok = payload.get("eval_count") + total_duration_ns = payload.get("total_duration") + if isinstance(in_tok, int): + result.sample_input_tokens = in_tok + if isinstance(out_tok, int): + result.sample_output_tokens = out_tok + if isinstance(total_duration_ns, int): + result.sample_latency_ms = max(0, int(round(total_duration_ns / 1_000_000))) + except Exception: + pass + return result diff --git a/backend/migrations/versions/c4a1d2e3f4a5_make_main_session_key_nullable.py b/backend/migrations/versions/c4a1d2e3f4a5_make_main_session_key_nullable.py new file mode 100644 index 0000000..6aba4b3 --- /dev/null +++ b/backend/migrations/versions/c4a1d2e3f4a5_make_main_session_key_nullable.py @@ -0,0 +1,34 @@ +"""Make main_session_key nullable on gateways. 
+ +The column was NOT NULL but the ORM model didn't include it, causing +INSERT failures when creating gateways via the API. The field gets +populated by ensure_main_agent() after the row exists, so it needs +to be nullable during the initial INSERT. + +Revision ID: c4a1d2e3f4a5 +Revises: f7d8e9a0b1c2 +Create Date: 2026-05-21 04:25:00.000000 + +""" + +from __future__ import annotations + +import sqlalchemy as sa +from alembic import op + + +# revision identifiers, used by Alembic. +revision = "c4a1d2e3f4a5" +down_revision = "f7d8e9a0b1c2" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + """Allow main_session_key to be NULL on initial insert.""" + op.alter_column("gateways", "main_session_key", nullable=True) + + +def downgrade() -> None: + """Revert main_session_key to NOT NULL.""" + op.alter_column("gateways", "main_session_key", nullable=False) \ No newline at end of file diff --git a/backend/tests/test_provider_credentials_usage_api.py b/backend/tests/test_provider_credentials_usage_api.py index 3465784..5754844 100644 --- a/backend/tests/test_provider_credentials_usage_api.py +++ b/backend/tests/test_provider_credentials_usage_api.py @@ -89,6 +89,10 @@ async def test_usage_response_includes_rate_limit_header_names(monkeypatch: pyte checked_at=utcnow(), reachable=True, ) + result.sample_model = "claude-sonnet-4-6" + result.sample_input_tokens = 9 + result.sample_output_tokens = 1 + result.sample_latency_ms = 123 result.raw_headers = { "anthropic-ratelimit-requests-limit": "1000", "anthropic-ratelimit-requests-remaining": "999", @@ -117,6 +121,10 @@ async def test_usage_response_includes_rate_limit_header_names(monkeypatch: pyte data = response.json() assert data["provider"] == "anthropic" assert data["reachable"] is True + assert data["sample_model"] == "claude-sonnet-4-6" + assert data["sample_input_tokens"] == 9 + assert data["sample_output_tokens"] == 1 + assert data["sample_latency_ms"] == 123 assert data["debug_rate_limit_headers"] == 
[ "anthropic-ratelimit-requests-limit", "anthropic-ratelimit-requests-remaining", @@ -150,6 +158,10 @@ async def test_test_endpoint_returns_live_result(monkeypatch: pytest.MonkeyPatch reachable=True, ) result.models = ["claude-sonnet-4-6"] + result.sample_model = "claude-sonnet-4-6" + result.sample_input_tokens = 8 + result.sample_output_tokens = 1 + result.sample_latency_ms = 111 result.raw_headers = { "anthropic-ratelimit-tokens-limit": "100000", } @@ -184,6 +196,10 @@ async def test_test_endpoint_returns_live_result(monkeypatch: pytest.MonkeyPatch assert data["account_key"] == "Claude" assert data["reachable"] is True assert data["models"] == ["claude-sonnet-4-6"] + assert data["sample_model"] == "claude-sonnet-4-6" + assert data["sample_input_tokens"] == 8 + assert data["sample_output_tokens"] == 1 + assert data["sample_latency_ms"] == 111 assert data["debug_rate_limit_headers"] == ["anthropic-ratelimit-tokens-limit"] finally: await engine.dispose() diff --git a/frontend/src/api/generated/model/providerUsageLiveRead.ts b/frontend/src/api/generated/model/providerUsageLiveRead.ts index 23c2a4c..1bd1967 100644 --- a/frontend/src/api/generated/model/providerUsageLiveRead.ts +++ b/frontend/src/api/generated/model/providerUsageLiveRead.ts @@ -20,4 +20,9 @@ export interface ProviderUsageLiveRead { input_tokens: TokenWindowRead; requests: RequestWindowRead; models?: string[]; + sample_model?: string | null; + sample_input_tokens?: number | null; + sample_output_tokens?: number | null; + sample_latency_ms?: number | null; + debug_rate_limit_headers?: string[] | null; } diff --git a/frontend/src/app/settings/ai-providers/page.tsx b/frontend/src/app/settings/ai-providers/page.tsx index e3d6673..e8874c2 100644 --- a/frontend/src/app/settings/ai-providers/page.tsx +++ b/frontend/src/app/settings/ai-providers/page.tsx @@ -236,8 +236,17 @@ function CredentialForm({ {testResult.reachable ? "Connection successful" : "Connection failed"}
- {testResult.error ?? `${testResult.models?.length ?? 0} model${(testResult.models?.length ?? 0) === 1 ? "" : "s"} returned`} + {testResult.error ?? ( + testResult.sample_input_tokens != null || testResult.sample_output_tokens != null + ? `Usage probe: in ${fmtTokens(testResult.sample_input_tokens)} · out ${fmtTokens(testResult.sample_output_tokens)}` + : "Connected." + )}
+ {testResult.sample_latency_ms != null && ( ++ Probe time: {fmtLatencyMs(testResult.sample_latency_ms)} +
+ )} )} @@ -267,6 +276,12 @@ function fmtResetMs(ms: number | null | undefined): string { return `${h}h ${m % 60}m`; } +function fmtLatencyMs(ms: number | null | undefined): string { + if (ms == null || ms < 0) return "—"; + if (ms < 1000) return `${ms}ms`; + return `${(ms / 1000).toFixed(2)}s`; +} + function UsageStrip({ credentialId, provider }: { credentialId: string; provider: string }) { const [usage, setUsage] = useState- Connected — provider did not return token/request limit headers for this key tier. + Connected — no token/request limit windows were returned for this key right now.
)}