From 03bc31a558745353dc0088845cfe01052e1662fe Mon Sep 17 00:00:00 2001
From: null
Date: Wed, 20 May 2026 23:22:54 -0500
Subject: [PATCH] fix(db): make main_session_key nullable on gateways
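
The column was NOT NULL, but the ORM model did not declare it, so the
create path left it unset until ensure_main_agent() runs after the INSERT.
Declare the field on the model and make the column nullable so the
initial create succeeds.

This patch also adds a usage probe (sample model, input/output token
counts, latency) to the provider usage service, and surfaces it through
the API schema, the tests, and the AI providers settings page.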
---
backend/app/api/provider_credentials.py | 8 +
backend/app/models/gateways.py | 1 +
backend/app/schemas/provider_credentials.py | 4 +
backend/app/services/provider_usage.py | 227 ++++++++++++++++--
...d2e3f4a5_make_main_session_key_nullable.py | 34 +++
.../test_provider_credentials_usage_api.py | 16 ++
.../generated/model/providerUsageLiveRead.ts | 5 +
.../src/app/settings/ai-providers/page.tsx | 75 ++++--
8 files changed, 332 insertions(+), 38 deletions(-)
create mode 100644 backend/migrations/versions/c4a1d2e3f4a5_make_main_session_key_nullable.py
diff --git a/backend/app/api/provider_credentials.py b/backend/app/api/provider_credentials.py
index 2734639..1a4db21 100644
--- a/backend/app/api/provider_credentials.py
+++ b/backend/app/api/provider_credentials.py
@@ -167,6 +167,10 @@ async def test_provider_credential(
input_tokens=_tok(live.input_tokens),
requests=_req(live.requests),
models=live.models,
+ sample_model=live.sample_model,
+ sample_input_tokens=live.sample_input_tokens,
+ sample_output_tokens=live.sample_output_tokens,
+ sample_latency_ms=live.sample_latency_ms,
debug_rate_limit_headers=sorted(live.raw_headers.keys()) if live.raw_headers else None,
)
@@ -265,6 +269,10 @@ async def get_provider_usage_live(
input_tokens=_tok(live.input_tokens),
requests=_req(live.requests),
models=live.models,
+ sample_model=live.sample_model,
+ sample_input_tokens=live.sample_input_tokens,
+ sample_output_tokens=live.sample_output_tokens,
+ sample_latency_ms=live.sample_latency_ms,
debug_rate_limit_headers=sorted(live.raw_headers.keys()) if live.raw_headers else None,
)
diff --git a/backend/app/models/gateways.py b/backend/app/models/gateways.py
index 3451a10..68e6776 100644
--- a/backend/app/models/gateways.py
+++ b/backend/app/models/gateways.py
@@ -26,5 +26,6 @@ class Gateway(QueryModel, table=True):
disable_device_pairing: bool = Field(default=False)
workspace_root: str
allow_insecure_tls: bool = Field(default=False)
+ main_session_key: str | None = Field(default=None)
created_at: datetime = Field(default_factory=utcnow)
updated_at: datetime = Field(default_factory=utcnow)
diff --git a/backend/app/schemas/provider_credentials.py b/backend/app/schemas/provider_credentials.py
index 46dba5b..25a40d4 100644
--- a/backend/app/schemas/provider_credentials.py
+++ b/backend/app/schemas/provider_credentials.py
@@ -61,6 +61,10 @@ class ProviderUsageLiveRead(SQLModel):
input_tokens: TokenWindowRead # Anthropic splits input tokens separately
requests: RequestWindowRead
models: list[str] = []
+ sample_model: str | None = None
+ sample_input_tokens: int | None = None
+ sample_output_tokens: int | None = None
+ sample_latency_ms: int | None = None
# Optional debugging aid: exact rate-limit header names returned by provider.
debug_rate_limit_headers: list[str] | None = None
diff --git a/backend/app/services/provider_usage.py b/backend/app/services/provider_usage.py
index 6dcae63..d2ce6c2 100644
--- a/backend/app/services/provider_usage.py
+++ b/backend/app/services/provider_usage.py
@@ -13,14 +13,20 @@ anthropic → GET https://api.anthropic.com/v1/models
Headers: anthropic-ratelimit-tokens-limit/remaining/reset
anthropic-ratelimit-requests-limit/remaining/reset
anthropic-ratelimit-input-tokens-limit/remaining/reset
+ Fallback probe (only when headers missing):
+ POST /v1/messages with max_tokens=1 to surface usage+time data.
openai → GET https://api.openai.com/v1/models
(codex) Headers: x-ratelimit-limit-tokens, x-ratelimit-remaining-tokens,
x-ratelimit-reset-tokens, x-ratelimit-limit-requests,
x-ratelimit-remaining-requests, x-ratelimit-reset-requests
+ Fallback probe (only when headers missing):
+ POST /v1/chat/completions with max_tokens=1 to surface usage+time.
ollama → GET {base_url}/api/tags (health-check only; no rate limits)
Returns: model list, server reachable flag
+ Usage probe (runs whenever at least one model is listed, not only as a fallback):
+ POST {base_url}/api/generate with num_predict=1 for usage+time.
Caching
-------
@@ -102,16 +108,24 @@ class ProviderUsageLive:
requests: RequestWindow = field(default_factory=RequestWindow)
models: list[str] = field(default_factory=list) # model IDs available on this key
raw_headers: dict[str, str] = field(default_factory=dict)
+ sample_model: str | None = None
+ sample_input_tokens: int | None = None
+ sample_output_tokens: int | None = None
+ sample_latency_ms: int | None = None
def to_dict(self) -> dict[str, Any]:
def _window(w: TokenWindow | RequestWindow) -> dict[str, Any]:
d: dict[str, Any] = {}
- if hasattr(w, "limit"): d["limit"] = w.limit
- if hasattr(w, "remaining"): d["remaining"] = w.remaining
- if hasattr(w, "reset_in_ms"): d["reset_in_ms"] = w.reset_in_ms
- if hasattr(w, "reset_at"): d["reset_at"] = w.reset_at.isoformat() if w.reset_at else None
+ if hasattr(w, "limit"):
+ d["limit"] = w.limit
+ if hasattr(w, "remaining"):
+ d["remaining"] = w.remaining
+ if hasattr(w, "reset_in_ms"):
+ d["reset_in_ms"] = w.reset_in_ms
+ if hasattr(w, "reset_at"):
+ d["reset_at"] = w.reset_at.isoformat() if w.reset_at else None
if isinstance(w, TokenWindow):
- d["used"] = w.used
+ d["used"] = w.used
d["pct_used"] = w.pct_used
return d
@@ -125,6 +139,10 @@ class ProviderUsageLive:
"input_tokens": _window(self.input_tokens),
"requests": _window(self.requests),
"models": self.models[:20], # cap for response size
+ "sample_model": self.sample_model,
+ "sample_input_tokens": self.sample_input_tokens,
+ "sample_output_tokens": self.sample_output_tokens,
+ "sample_latency_ms": self.sample_latency_ms,
}
@@ -163,6 +181,49 @@ _OAI_DURATION_RE = re.compile(
)
+def _apply_anthropic_ratelimit_headers(result: ProviderUsageLive, headers: dict[str, str]) -> None:
+    """Overwrite the three rate-limit windows on ``result`` from Anthropic headers.
+
+    ``tokens``, ``input_tokens`` and ``requests`` are each rebuilt from their
+    ``-limit`` / ``-remaining`` / ``-reset`` header. The windows are replaced
+    unconditionally, whether or not the headers are present.
+    """
+
+    def _count(name: str):
+        return _parse_int_header(headers, name)
+
+    def _reset(name: str):
+        # A missing reset header is handed to the parser as an empty string.
+        return _parse_iso_reset(headers.get(name, ""))
+
+    result.tokens = TokenWindow(
+        limit=_count("anthropic-ratelimit-tokens-limit"),
+        remaining=_count("anthropic-ratelimit-tokens-remaining"),
+        reset_at=_reset("anthropic-ratelimit-tokens-reset"),
+    )
+    result.input_tokens = TokenWindow(
+        limit=_count("anthropic-ratelimit-input-tokens-limit"),
+        remaining=_count("anthropic-ratelimit-input-tokens-remaining"),
+        reset_at=_reset("anthropic-ratelimit-input-tokens-reset"),
+    )
+    result.requests = RequestWindow(
+        limit=_count("anthropic-ratelimit-requests-limit"),
+        remaining=_count("anthropic-ratelimit-requests-remaining"),
+        reset_at=_reset("anthropic-ratelimit-requests-reset"),
+    )
+
+
+def _pick_anthropic_probe_model(models: list[str]) -> str | None:
+    """Choose the model id used for the Anthropic usage probe.
+
+    Families are tried in the order haiku, sonnet, opus; within a family the
+    first id (in listing order) whose lower-cased name contains the family
+    wins. With no family match the first listed id is returned, and an empty
+    list yields ``None``.
+    """
+    for family in ("haiku", "sonnet", "opus"):
+        hit = next((name for name in models if family in name.lower()), None)
+        if hit is not None:
+            return hit
+    return models[0] if models else None
+
+
+def _pick_openai_probe_model(models: list[str]) -> str | None:
+    """Choose the model id used for the OpenAI chat-completions usage probe.
+
+    Priorities are tried in order. Within one priority an id that equals the
+    name exactly (case-insensitive) wins; otherwise the first listed id that
+    contains the name is used. With no match at all the first listed id is
+    returned, and an empty list yields ``None``.
+    """
+    if not models:
+        return None
+    priorities = ("gpt-4.1-mini", "gpt-4o-mini", "gpt-4.1", "gpt-4o", "o4-mini")
+    lowered = [(m, m.lower()) for m in models]
+    for priority in priorities:
+        hits = [(original, name) for original, name in lowered if priority in name]
+        if hits:
+            # Listing order is not under our control, so a variant id that only
+            # contains the name (e.g. "gpt-4o-mini-tts") may come first; prefer
+            # the exact id and fall back to the first substring hit.
+            return next((o for o, name in hits if name == priority), hits[0][0])
+    return models[0]
+
+
def _parse_openai_reset(value: str) -> datetime | None:
"""Parse an OpenAI reset header: ISO datetime OR duration like '1m30s'."""
if not value:
@@ -219,23 +280,7 @@ async def _fetch_anthropic(api_key: str, base_url: str | None) -> ProviderUsageL
result.reachable = True
result.raw_headers = {k: v for k, v in h.items() if "ratelimit" in k}
- # Token window (combined input+output)
- result.tokens = TokenWindow(
- limit = _parse_int_header(h, "anthropic-ratelimit-tokens-limit"),
- remaining = _parse_int_header(h, "anthropic-ratelimit-tokens-remaining"),
- reset_at = _parse_iso_reset(h.get("anthropic-ratelimit-tokens-reset", "")),
- )
- # Input-token window (separate limit for input)
- result.input_tokens = TokenWindow(
- limit = _parse_int_header(h, "anthropic-ratelimit-input-tokens-limit"),
- remaining = _parse_int_header(h, "anthropic-ratelimit-input-tokens-remaining"),
- reset_at = _parse_iso_reset(h.get("anthropic-ratelimit-input-tokens-reset", "")),
- )
- result.requests = RequestWindow(
- limit = _parse_int_header(h, "anthropic-ratelimit-requests-limit"),
- remaining = _parse_int_header(h, "anthropic-ratelimit-requests-remaining"),
- reset_at = _parse_iso_reset(h.get("anthropic-ratelimit-requests-reset", "")),
- )
+ _apply_anthropic_ratelimit_headers(result, h)
# Extract model IDs
try:
@@ -245,6 +290,56 @@ async def _fetch_anthropic(api_key: str, base_url: str | None) -> ProviderUsageL
except Exception:
pass
+ # Some tiers/paths may omit ratelimit headers on /v1/models.
+ # Fallback to a minimal /v1/messages probe so we can still surface usage/time.
+ if (
+ result.tokens.limit is None
+ and result.input_tokens.limit is None
+ and result.requests.limit is None
+ ):
+ probe_model = _pick_anthropic_probe_model(result.models)
+ if probe_model:
+ result.sample_model = probe_model
+ async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client:
+ try:
+ probe_resp = await client.post(
+ f"{base}/v1/messages",
+ headers={
+ "x-api-key": api_key,
+ "anthropic-version": "2023-06-01",
+ "content-type": "application/json",
+ },
+ json={
+ "model": probe_model,
+ "max_tokens": 1,
+ "messages": [{"role": "user", "content": "Usage probe"}],
+ },
+ )
+ except Exception:
+ probe_resp = None
+
+ if probe_resp is not None:
+ probe_headers = {k.lower(): v for k, v in probe_resp.headers.items()}
+ probe_rl_headers = {k: v for k, v in probe_headers.items() if "ratelimit" in k}
+ if probe_rl_headers:
+ result.raw_headers = probe_rl_headers
+ _apply_anthropic_ratelimit_headers(result, probe_headers)
+ if probe_resp.status_code == 200:
+ try:
+ payload = probe_resp.json()
+ usage = payload.get("usage") if isinstance(payload, dict) else None
+ if isinstance(usage, dict):
+ in_tok = usage.get("input_tokens")
+ out_tok = usage.get("output_tokens")
+ if isinstance(in_tok, int):
+ result.sample_input_tokens = in_tok
+ if isinstance(out_tok, int):
+ result.sample_output_tokens = out_tok
+ except Exception:
+ pass
+ elapsed_ms = probe_resp.elapsed.total_seconds() * 1000.0
+ result.sample_latency_ms = int(max(0.0, round(elapsed_ms)))
+
return result
@@ -296,6 +391,63 @@ async def _fetch_openai(api_key: str, base_url: str | None) -> ProviderUsageLive
except Exception:
pass
+ if result.tokens.limit is None and result.requests.limit is None:
+ probe_model = _pick_openai_probe_model(result.models)
+ if probe_model:
+ result.sample_model = probe_model
+ async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client:
+ try:
+ probe_resp = await client.post(
+ f"{base}/v1/chat/completions",
+ headers={
+ "Authorization": f"Bearer {api_key}",
+ "content-type": "application/json",
+ },
+ json={
+ "model": probe_model,
+ "messages": [{"role": "user", "content": "Usage probe"}],
+ "max_tokens": 1,
+ },
+ )
+ except Exception:
+ probe_resp = None
+ if probe_resp is not None:
+ probe_headers = {k.lower(): v for k, v in probe_resp.headers.items()}
+ probe_rl_headers = {k: v for k, v in probe_headers.items() if "ratelimit" in k}
+ if probe_rl_headers:
+ result.raw_headers = probe_rl_headers
+ result.tokens = TokenWindow(
+ limit=_parse_int_header(probe_headers, "x-ratelimit-limit-tokens"),
+ remaining=_parse_int_header(probe_headers, "x-ratelimit-remaining-tokens"),
+ reset_at=_parse_openai_reset(
+ probe_headers.get("x-ratelimit-reset-tokens", "")
+ ),
+ )
+ result.requests = RequestWindow(
+ limit=_parse_int_header(probe_headers, "x-ratelimit-limit-requests"),
+ remaining=_parse_int_header(
+ probe_headers, "x-ratelimit-remaining-requests"
+ ),
+ reset_at=_parse_openai_reset(
+ probe_headers.get("x-ratelimit-reset-requests", "")
+ ),
+ )
+ if probe_resp.status_code == 200:
+ try:
+ payload = probe_resp.json()
+ usage = payload.get("usage") if isinstance(payload, dict) else None
+ if isinstance(usage, dict):
+ in_tok = usage.get("prompt_tokens")
+ out_tok = usage.get("completion_tokens")
+ if isinstance(in_tok, int):
+ result.sample_input_tokens = in_tok
+ if isinstance(out_tok, int):
+ result.sample_output_tokens = out_tok
+ except Exception:
+ pass
+ elapsed_ms = probe_resp.elapsed.total_seconds() * 1000.0
+ result.sample_latency_ms = int(max(0.0, round(elapsed_ms)))
+
return result
@@ -331,6 +483,37 @@ async def _fetch_ollama(base_url: str | None, api_key: str | None) -> ProviderUs
except Exception:
pass
+ if result.models:
+ result.sample_model = result.models[0]
+ async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client:
+ try:
+ probe_resp = await client.post(
+ f"{base}/api/generate",
+ headers={**headers, "content-type": "application/json"},
+ json={
+ "model": result.sample_model,
+ "prompt": "Usage probe",
+ "stream": False,
+ "options": {"num_predict": 1},
+ },
+ )
+ except Exception:
+ probe_resp = None
+ if probe_resp is not None and probe_resp.status_code == 200:
+ try:
+ payload = probe_resp.json()
+ in_tok = payload.get("prompt_eval_count")
+ out_tok = payload.get("eval_count")
+ total_duration_ns = payload.get("total_duration")
+ if isinstance(in_tok, int):
+ result.sample_input_tokens = in_tok
+ if isinstance(out_tok, int):
+ result.sample_output_tokens = out_tok
+ if isinstance(total_duration_ns, int):
+ result.sample_latency_ms = max(0, int(round(total_duration_ns / 1_000_000)))
+ except Exception:
+ pass
+
return result
diff --git a/backend/migrations/versions/c4a1d2e3f4a5_make_main_session_key_nullable.py b/backend/migrations/versions/c4a1d2e3f4a5_make_main_session_key_nullable.py
new file mode 100644
index 0000000..6aba4b3
--- /dev/null
+++ b/backend/migrations/versions/c4a1d2e3f4a5_make_main_session_key_nullable.py
@@ -0,0 +1,34 @@
+"""Make main_session_key nullable on gateways.
+
+The column was NOT NULL but the ORM model didn't include it, causing
+INSERT failures when creating gateways via the API. The field gets
+populated by ensure_main_agent() after the row exists, so it needs
+to be nullable during the initial INSERT.
+
+Revision ID: c4a1d2e3f4a5
+Revises: f7d8e9a0b1c2
+Create Date: 2026-05-21 04:25:00.000000
+
+"""
+
+from __future__ import annotations
+
+import sqlalchemy as sa
+from alembic import op
+
+
+# revision identifiers, used by Alembic.
+revision = "c4a1d2e3f4a5"
+down_revision = "f7d8e9a0b1c2"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+ """Drop NOT NULL from gateways.main_session_key so the first INSERT may omit it."""
+ op.alter_column("gateways", "main_session_key", nullable=True)
+
+
+def downgrade() -> None:
+ """Restore NOT NULL on main_session_key; backfill NULL rows before running this."""
+ op.alter_column("gateways", "main_session_key", nullable=False)
\ No newline at end of file
diff --git a/backend/tests/test_provider_credentials_usage_api.py b/backend/tests/test_provider_credentials_usage_api.py
index 3465784..5754844 100644
--- a/backend/tests/test_provider_credentials_usage_api.py
+++ b/backend/tests/test_provider_credentials_usage_api.py
@@ -89,6 +89,10 @@ async def test_usage_response_includes_rate_limit_header_names(monkeypatch: pyte
checked_at=utcnow(),
reachable=True,
)
+ result.sample_model = "claude-sonnet-4-6"
+ result.sample_input_tokens = 9
+ result.sample_output_tokens = 1
+ result.sample_latency_ms = 123
result.raw_headers = {
"anthropic-ratelimit-requests-limit": "1000",
"anthropic-ratelimit-requests-remaining": "999",
@@ -117,6 +121,10 @@ async def test_usage_response_includes_rate_limit_header_names(monkeypatch: pyte
data = response.json()
assert data["provider"] == "anthropic"
assert data["reachable"] is True
+ assert data["sample_model"] == "claude-sonnet-4-6"
+ assert data["sample_input_tokens"] == 9
+ assert data["sample_output_tokens"] == 1
+ assert data["sample_latency_ms"] == 123
assert data["debug_rate_limit_headers"] == [
"anthropic-ratelimit-requests-limit",
"anthropic-ratelimit-requests-remaining",
@@ -150,6 +158,10 @@ async def test_test_endpoint_returns_live_result(monkeypatch: pytest.MonkeyPatch
reachable=True,
)
result.models = ["claude-sonnet-4-6"]
+ result.sample_model = "claude-sonnet-4-6"
+ result.sample_input_tokens = 8
+ result.sample_output_tokens = 1
+ result.sample_latency_ms = 111
result.raw_headers = {
"anthropic-ratelimit-tokens-limit": "100000",
}
@@ -184,6 +196,10 @@ async def test_test_endpoint_returns_live_result(monkeypatch: pytest.MonkeyPatch
assert data["account_key"] == "Claude"
assert data["reachable"] is True
assert data["models"] == ["claude-sonnet-4-6"]
+ assert data["sample_model"] == "claude-sonnet-4-6"
+ assert data["sample_input_tokens"] == 8
+ assert data["sample_output_tokens"] == 1
+ assert data["sample_latency_ms"] == 111
assert data["debug_rate_limit_headers"] == ["anthropic-ratelimit-tokens-limit"]
finally:
await engine.dispose()
diff --git a/frontend/src/api/generated/model/providerUsageLiveRead.ts b/frontend/src/api/generated/model/providerUsageLiveRead.ts
index 23c2a4c..1bd1967 100644
--- a/frontend/src/api/generated/model/providerUsageLiveRead.ts
+++ b/frontend/src/api/generated/model/providerUsageLiveRead.ts
@@ -20,4 +20,9 @@ export interface ProviderUsageLiveRead {
input_tokens: TokenWindowRead;
requests: RequestWindowRead;
models?: string[];
+ sample_model?: string | null;
+ sample_input_tokens?: number | null;
+ sample_output_tokens?: number | null;
+ sample_latency_ms?: number | null;
+ debug_rate_limit_headers?: string[] | null;
}
diff --git a/frontend/src/app/settings/ai-providers/page.tsx b/frontend/src/app/settings/ai-providers/page.tsx
index e3d6673..e8874c2 100644
--- a/frontend/src/app/settings/ai-providers/page.tsx
+++ b/frontend/src/app/settings/ai-providers/page.tsx
@@ -236,8 +236,17 @@ function CredentialForm({
{testResult.reachable ? "Connection successful" : "Connection failed"}
- {testResult.error ?? `${testResult.models?.length ?? 0} model${(testResult.models?.length ?? 0) === 1 ? "" : "s"} returned`}
+ {testResult.error ?? (
+ testResult.sample_input_tokens != null || testResult.sample_output_tokens != null
+ ? `Usage probe: in ${fmtTokens(testResult.sample_input_tokens)} · out ${fmtTokens(testResult.sample_output_tokens)}`
+ : "Connected."
+ )}
+ {testResult.sample_latency_ms != null && (
+
+ Probe time: {fmtLatencyMs(testResult.sample_latency_ms)}
+
+ )}
)}
@@ -267,6 +276,12 @@ function fmtResetMs(ms: number | null | undefined): string {
return `${h}h ${m % 60}m`;
}
+function fmtLatencyMs(ms: number | null | undefined): string {
+  // Missing or negative -> em dash; under 1000 -> "<n>ms"; otherwise seconds, 2 decimals.
+  if (ms === null || ms === undefined || ms < 0) return "—";
+  return ms < 1000 ? `${ms}ms` : `${(ms / 1000).toFixed(2)}s`;
+}
+
function UsageStrip({ credentialId, provider }: { credentialId: string; provider: string }) {
const [usage, setUsage] = useState(null);
const [loading, setLoading] = useState(true);
@@ -318,30 +333,58 @@ function UsageStrip({ credentialId, provider }: { credentialId: string; provider
const inputTok = usage.input_tokens;
const req = usage.requests;
const isOllama = provider === "ollama";
- const modelCount = usage.models?.length ?? 0;
return (
{isOllama ? (
-
-
-
- Connected
-
- {(usage.models?.length ?? 0) > 0 && (
-
{usage.models!.length} model{usage.models!.length !== 1 ? "s" : ""} available
+
+
+
+
+ Connected
+
+ {(usage.models?.length ?? 0) > 0 && (
+ {usage.models!.length} model{usage.models!.length !== 1 ? "s" : ""} available
+ )}
+ fetch(true)} className="ml-auto text-muted hover:text-strong">
+
+
+
+ {(usage.sample_input_tokens != null || usage.sample_output_tokens != null) && (
+
+ Usage (last probe)
+
+ in {fmtTokens(usage.sample_input_tokens)} · out {fmtTokens(usage.sample_output_tokens)}
+
+
)}
-
fetch(true)} className="ml-auto text-muted hover:text-strong">
-
-
+ {usage.sample_latency_ms != null && (
+
+ Time (last probe)
+
+ {fmtLatencyMs(usage.sample_latency_ms)}
+
+
+ )}
+ {usage.sample_latency_ms != null && (
+
+ Time (last probe)
+
+ {fmtLatencyMs(usage.sample_latency_ms)}
+
+
+ )}
+
+ {lastFetched && Updated {Math.round((Date.now() - lastFetched.getTime()) / 1000)}s ago }
+
) : (
- {modelCount > 0 && (
+ {(usage.sample_input_tokens != null || usage.sample_output_tokens != null) && (
- Models
+ Usage (last probe)
- {modelCount} available
+ in {fmtTokens(usage.sample_input_tokens)} · out {fmtTokens(usage.sample_output_tokens)}
)}
@@ -415,7 +458,7 @@ function UsageStrip({ credentialId, provider }: { credentialId: string; provider
{tok.limit == null && inputTok.limit == null && req.limit == null && (
- Connected — provider did not return token/request limit headers for this key tier.
+ Connected — no token/request limit windows were returned for this key right now.
)}