fix(db): make main_session_key nullable on gateways

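The column was NOT NULL, but the Gateway ORM model did not declare it and the create path doesn't populate it until ensure_main_agent() runs after the INSERT, so creating a gateway via the API failed. Declare the field on the model and make the column nullable so the initial create succeeds.

Also adds a minimal usage probe to the provider usage service (Anthropic, OpenAI, Ollama) and surfaces its results (sample_model, sample_input_tokens, sample_output_tokens, sample_latency_ms) through the API schema, the generated frontend type, and the AI providers settings page.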
2026-05-20 23:22:54 -05:00 · 2026-05-20 23:22:54 -05:00 · 03bc31a558
parent fc4094d49f
commit 03bc31a558
8 changed files with 332 additions and 38 deletions
--- a/backend/app/api/provider_credentials.py
+++ b/backend/app/api/provider_credentials.py
@ -167,6 +167,10 @@ async def test_provider_credential(
        input_tokens=_tok(live.input_tokens),
        requests=_req(live.requests),
        models=live.models,
+        sample_model=live.sample_model,
+        sample_input_tokens=live.sample_input_tokens,
+        sample_output_tokens=live.sample_output_tokens,
+        sample_latency_ms=live.sample_latency_ms,
        debug_rate_limit_headers=sorted(live.raw_headers.keys()) if live.raw_headers else None,
    )

@ -265,6 +269,10 @@ async def get_provider_usage_live(
        input_tokens=_tok(live.input_tokens),
        requests=_req(live.requests),
        models=live.models,
+        sample_model=live.sample_model,
+        sample_input_tokens=live.sample_input_tokens,
+        sample_output_tokens=live.sample_output_tokens,
+        sample_latency_ms=live.sample_latency_ms,
        debug_rate_limit_headers=sorted(live.raw_headers.keys()) if live.raw_headers else None,
    )

--- a/backend/app/models/gateways.py
+++ b/backend/app/models/gateways.py
@ -26,5 +26,6 @@ class Gateway(QueryModel, table=True):
    disable_device_pairing: bool = Field(default=False)
    workspace_root: str
    allow_insecure_tls: bool = Field(default=False)
+    main_session_key: str | None = Field(default=None)
    created_at: datetime = Field(default_factory=utcnow)
    updated_at: datetime = Field(default_factory=utcnow)
--- a/backend/app/schemas/provider_credentials.py
+++ b/backend/app/schemas/provider_credentials.py
@ -61,6 +61,10 @@ class ProviderUsageLiveRead(SQLModel):
    input_tokens: TokenWindowRead   # Anthropic splits input tokens separately
    requests: RequestWindowRead
    models: list[str] = []
+    sample_model: str | None = None
+    sample_input_tokens: int | None = None
+    sample_output_tokens: int | None = None
+    sample_latency_ms: int | None = None
    # Optional debugging aid: exact rate-limit header names returned by provider.
    debug_rate_limit_headers: list[str] | None = None

--- a/backend/app/services/provider_usage.py
+++ b/backend/app/services/provider_usage.py
@ -13,14 +13,20 @@ anthropic  → GET https://api.anthropic.com/v1/models
             Headers: anthropic-ratelimit-tokens-limit/remaining/reset
                      anthropic-ratelimit-requests-limit/remaining/reset
                      anthropic-ratelimit-input-tokens-limit/remaining/reset
+             Fallback probe (only when headers missing):
+             POST /v1/messages with max_tokens=1 to surface usage+time data.

 openai     → GET https://api.openai.com/v1/models
 (codex)      Headers: x-ratelimit-limit-tokens, x-ratelimit-remaining-tokens,
                      x-ratelimit-reset-tokens, x-ratelimit-limit-requests,
                      x-ratelimit-remaining-requests, x-ratelimit-reset-requests
+             Fallback probe (only when headers missing):
+             POST /v1/chat/completions with max_tokens=1 to surface usage+time.

 ollama     → GET {base_url}/api/tags  (health-check only; no rate limits)
             Returns: model list, server reachable flag
+             Usage probe (always sent when at least one model is listed):
+             POST {base_url}/api/generate with num_predict=1 for usage+time.

 Caching
 -------
@ -102,16 +108,24 @@ class ProviderUsageLive:
    requests: RequestWindow = field(default_factory=RequestWindow)
    models: list[str] = field(default_factory=list)   # model IDs available on this key
    raw_headers: dict[str, str] = field(default_factory=dict)
+    sample_model: str | None = None
+    sample_input_tokens: int | None = None
+    sample_output_tokens: int | None = None
+    sample_latency_ms: int | None = None

    def to_dict(self) -> dict[str, Any]:
        def _window(w: TokenWindow | RequestWindow) -> dict[str, Any]:
            d: dict[str, Any] = {}
-            if hasattr(w, "limit"):        d["limit"]       = w.limit
-            if hasattr(w, "remaining"):    d["remaining"]   = w.remaining
-            if hasattr(w, "reset_in_ms"): d["reset_in_ms"] = w.reset_in_ms
-            if hasattr(w, "reset_at"):    d["reset_at"]    = w.reset_at.isoformat() if w.reset_at else None
+            if hasattr(w, "limit"):
+                d["limit"] = w.limit
+            if hasattr(w, "remaining"):
+                d["remaining"] = w.remaining
+            if hasattr(w, "reset_in_ms"):
+                d["reset_in_ms"] = w.reset_in_ms
+            if hasattr(w, "reset_at"):
+                d["reset_at"] = w.reset_at.isoformat() if w.reset_at else None
            if isinstance(w, TokenWindow):
-                d["used"]     = w.used
+                d["used"] = w.used
                d["pct_used"] = w.pct_used
            return d

@ -125,6 +139,10 @@ class ProviderUsageLive:
            "input_tokens": _window(self.input_tokens),
            "requests":    _window(self.requests),
            "models":      self.models[:20],  # cap for response size
+            "sample_model": self.sample_model,
+            "sample_input_tokens": self.sample_input_tokens,
+            "sample_output_tokens": self.sample_output_tokens,
+            "sample_latency_ms": self.sample_latency_ms,
        }


@ -163,6 +181,49 @@ _OAI_DURATION_RE = re.compile(
 )


+def _apply_anthropic_ratelimit_headers(result: ProviderUsageLive, headers: dict[str, str]) -> None:
+    """Populate Anthropic limit windows from response headers."""
+    result.tokens = TokenWindow(
+        limit=_parse_int_header(headers, "anthropic-ratelimit-tokens-limit"),
+        remaining=_parse_int_header(headers, "anthropic-ratelimit-tokens-remaining"),
+        reset_at=_parse_iso_reset(headers.get("anthropic-ratelimit-tokens-reset", "")),
+    )
+    result.input_tokens = TokenWindow(
+        limit=_parse_int_header(headers, "anthropic-ratelimit-input-tokens-limit"),
+        remaining=_parse_int_header(headers, "anthropic-ratelimit-input-tokens-remaining"),
+        reset_at=_parse_iso_reset(headers.get("anthropic-ratelimit-input-tokens-reset", "")),
+    )
+    result.requests = RequestWindow(
+        limit=_parse_int_header(headers, "anthropic-ratelimit-requests-limit"),
+        remaining=_parse_int_header(headers, "anthropic-ratelimit-requests-remaining"),
+        reset_at=_parse_iso_reset(headers.get("anthropic-ratelimit-requests-reset", "")),
+    )
+
+
+def _pick_anthropic_probe_model(models: list[str]) -> str | None:
+    if not models:
+        return None
+    priorities = ("haiku", "sonnet", "opus")
+    lowered = [(m, m.lower()) for m in models]
+    for priority in priorities:
+        for original, lowered_name in lowered:
+            if priority in lowered_name:
+                return original
+    return models[0]
+
+
+def _pick_openai_probe_model(models: list[str]) -> str | None:
+    if not models:
+        return None
+    priorities = ("gpt-4.1-mini", "gpt-4o-mini", "gpt-4.1", "gpt-4o", "o4-mini")
+    lowered = [(m, m.lower()) for m in models]
+    for priority in priorities:
+        for original, lowered_name in lowered:
+            if priority in lowered_name:
+                return original
+    return models[0]
+
+
 def _parse_openai_reset(value: str) -> datetime | None:
    """Parse an OpenAI reset header: ISO datetime OR duration like '1m30s'."""
    if not value:
@ -219,23 +280,7 @@ async def _fetch_anthropic(api_key: str, base_url: str | None) -> ProviderUsageL
    result.reachable = True
    result.raw_headers = {k: v for k, v in h.items() if "ratelimit" in k}

-    # Token window (combined input+output)
-    result.tokens = TokenWindow(
-        limit     = _parse_int_header(h, "anthropic-ratelimit-tokens-limit"),
-        remaining = _parse_int_header(h, "anthropic-ratelimit-tokens-remaining"),
-        reset_at  = _parse_iso_reset(h.get("anthropic-ratelimit-tokens-reset", "")),
-    )
-    # Input-token window (separate limit for input)
-    result.input_tokens = TokenWindow(
-        limit     = _parse_int_header(h, "anthropic-ratelimit-input-tokens-limit"),
-        remaining = _parse_int_header(h, "anthropic-ratelimit-input-tokens-remaining"),
-        reset_at  = _parse_iso_reset(h.get("anthropic-ratelimit-input-tokens-reset", "")),
-    )
-    result.requests = RequestWindow(
-        limit     = _parse_int_header(h, "anthropic-ratelimit-requests-limit"),
-        remaining = _parse_int_header(h, "anthropic-ratelimit-requests-remaining"),
-        reset_at  = _parse_iso_reset(h.get("anthropic-ratelimit-requests-reset", "")),
-    )
+    _apply_anthropic_ratelimit_headers(result, h)

    # Extract model IDs
    try:
@ -245,6 +290,56 @@ async def _fetch_anthropic(api_key: str, base_url: str | None) -> ProviderUsageL
    except Exception:
        pass

+    # Some tiers/paths may omit ratelimit headers on /v1/models.
+    # Fallback to a minimal /v1/messages probe so we can still surface usage/time.
+    if (
+        result.tokens.limit is None
+        and result.input_tokens.limit is None
+        and result.requests.limit is None
+    ):
+        probe_model = _pick_anthropic_probe_model(result.models)
+        if probe_model:
+            result.sample_model = probe_model
+            async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client:
+                try:
+                    probe_resp = await client.post(
+                        f"{base}/v1/messages",
+                        headers={
+                            "x-api-key": api_key,
+                            "anthropic-version": "2023-06-01",
+                            "content-type": "application/json",
+                        },
+                        json={
+                            "model": probe_model,
+                            "max_tokens": 1,
+                            "messages": [{"role": "user", "content": "Usage probe"}],
+                        },
+                    )
+                except Exception:
+                    probe_resp = None
+
+            if probe_resp is not None:
+                probe_headers = {k.lower(): v for k, v in probe_resp.headers.items()}
+                probe_rl_headers = {k: v for k, v in probe_headers.items() if "ratelimit" in k}
+                if probe_rl_headers:
+                    result.raw_headers = probe_rl_headers
+                    _apply_anthropic_ratelimit_headers(result, probe_headers)
+                if probe_resp.status_code == 200:
+                    try:
+                        payload = probe_resp.json()
+                        usage = payload.get("usage") if isinstance(payload, dict) else None
+                        if isinstance(usage, dict):
+                            in_tok = usage.get("input_tokens")
+                            out_tok = usage.get("output_tokens")
+                            if isinstance(in_tok, int):
+                                result.sample_input_tokens = in_tok
+                            if isinstance(out_tok, int):
+                                result.sample_output_tokens = out_tok
+                    except Exception:
+                        pass
+                elapsed_ms = probe_resp.elapsed.total_seconds() * 1000.0
+                result.sample_latency_ms = int(max(0.0, round(elapsed_ms)))
+
    return result


@ -296,6 +391,63 @@ async def _fetch_openai(api_key: str, base_url: str | None) -> ProviderUsageLive
    except Exception:
        pass

+    if result.tokens.limit is None and result.requests.limit is None:
+        probe_model = _pick_openai_probe_model(result.models)
+        if probe_model:
+            result.sample_model = probe_model
+            async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client:
+                try:
+                    probe_resp = await client.post(
+                        f"{base}/v1/chat/completions",
+                        headers={
+                            "Authorization": f"Bearer {api_key}",
+                            "content-type": "application/json",
+                        },
+                        json={
+                            "model": probe_model,
+                            "messages": [{"role": "user", "content": "Usage probe"}],
+                            "max_tokens": 1,
+                        },
+                    )
+                except Exception:
+                    probe_resp = None
+            if probe_resp is not None:
+                probe_headers = {k.lower(): v for k, v in probe_resp.headers.items()}
+                probe_rl_headers = {k: v for k, v in probe_headers.items() if "ratelimit" in k}
+                if probe_rl_headers:
+                    result.raw_headers = probe_rl_headers
+                    result.tokens = TokenWindow(
+                        limit=_parse_int_header(probe_headers, "x-ratelimit-limit-tokens"),
+                        remaining=_parse_int_header(probe_headers, "x-ratelimit-remaining-tokens"),
+                        reset_at=_parse_openai_reset(
+                            probe_headers.get("x-ratelimit-reset-tokens", "")
+                        ),
+                    )
+                    result.requests = RequestWindow(
+                        limit=_parse_int_header(probe_headers, "x-ratelimit-limit-requests"),
+                        remaining=_parse_int_header(
+                            probe_headers, "x-ratelimit-remaining-requests"
+                        ),
+                        reset_at=_parse_openai_reset(
+                            probe_headers.get("x-ratelimit-reset-requests", "")
+                        ),
+                    )
+                if probe_resp.status_code == 200:
+                    try:
+                        payload = probe_resp.json()
+                        usage = payload.get("usage") if isinstance(payload, dict) else None
+                        if isinstance(usage, dict):
+                            in_tok = usage.get("prompt_tokens")
+                            out_tok = usage.get("completion_tokens")
+                            if isinstance(in_tok, int):
+                                result.sample_input_tokens = in_tok
+                            if isinstance(out_tok, int):
+                                result.sample_output_tokens = out_tok
+                    except Exception:
+                        pass
+                elapsed_ms = probe_resp.elapsed.total_seconds() * 1000.0
+                result.sample_latency_ms = int(max(0.0, round(elapsed_ms)))
+
    return result


@ -331,6 +483,37 @@ async def _fetch_ollama(base_url: str | None, api_key: str | None) -> ProviderUs
    except Exception:
        pass

+    if result.models:
+        result.sample_model = result.models[0]
+        async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client:
+            try:
+                probe_resp = await client.post(
+                    f"{base}/api/generate",
+                    headers={**headers, "content-type": "application/json"},
+                    json={
+                        "model": result.sample_model,
+                        "prompt": "Usage probe",
+                        "stream": False,
+                        "options": {"num_predict": 1},
+                    },
+                )
+            except Exception:
+                probe_resp = None
+        if probe_resp is not None and probe_resp.status_code == 200:
+            try:
+                payload = probe_resp.json()
+                in_tok = payload.get("prompt_eval_count")
+                out_tok = payload.get("eval_count")
+                total_duration_ns = payload.get("total_duration")
+                if isinstance(in_tok, int):
+                    result.sample_input_tokens = in_tok
+                if isinstance(out_tok, int):
+                    result.sample_output_tokens = out_tok
+                if isinstance(total_duration_ns, int):
+                    result.sample_latency_ms = max(0, int(round(total_duration_ns / 1_000_000)))
+            except Exception:
+                pass
+
    return result


--- a/backend/migrations/versions/c4a1d2e3f4a5_make_main_session_key_nullable.py
+++ b/backend/migrations/versions/c4a1d2e3f4a5_make_main_session_key_nullable.py
@ -0,0 +1,34 @@
+"""Make main_session_key nullable on gateways.
+
+The column was NOT NULL but the ORM model didn't include it, causing
+INSERT failures when creating gateways via the API.  The field gets
+populated by ensure_main_agent() after the row exists, so it needs
+to be nullable during the initial INSERT.
+
+Revision ID: c4a1d2e3f4a5
+Revises: f7d8e9a0b1c2
+Create Date: 2026-05-21 04:25:00.000000
+
+"""
+
+from __future__ import annotations
+
+import sqlalchemy as sa
+from alembic import op
+
+
+# revision identifiers, used by Alembic.
+revision = "c4a1d2e3f4a5"
+down_revision = "f7d8e9a0b1c2"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Allow main_session_key to be NULL on initial insert."""
+    op.alter_column("gateways", "main_session_key", nullable=True)
+
+
+def downgrade() -> None:
+    """Revert main_session_key to NOT NULL."""
+    op.alter_column("gateways", "main_session_key", nullable=False)
--- a/backend/tests/test_provider_credentials_usage_api.py
+++ b/backend/tests/test_provider_credentials_usage_api.py
@ -89,6 +89,10 @@ async def test_usage_response_includes_rate_limit_header_names(monkeypatch: pyte
            checked_at=utcnow(),
            reachable=True,
        )
+        result.sample_model = "claude-sonnet-4-6"
+        result.sample_input_tokens = 9
+        result.sample_output_tokens = 1
+        result.sample_latency_ms = 123
        result.raw_headers = {
            "anthropic-ratelimit-requests-limit": "1000",
            "anthropic-ratelimit-requests-remaining": "999",
@ -117,6 +121,10 @@ async def test_usage_response_includes_rate_limit_header_names(monkeypatch: pyte
        data = response.json()
        assert data["provider"] == "anthropic"
        assert data["reachable"] is True
+        assert data["sample_model"] == "claude-sonnet-4-6"
+        assert data["sample_input_tokens"] == 9
+        assert data["sample_output_tokens"] == 1
+        assert data["sample_latency_ms"] == 123
        assert data["debug_rate_limit_headers"] == [
            "anthropic-ratelimit-requests-limit",
            "anthropic-ratelimit-requests-remaining",
@ -150,6 +158,10 @@ async def test_test_endpoint_returns_live_result(monkeypatch: pytest.MonkeyPatch
            reachable=True,
        )
        result.models = ["claude-sonnet-4-6"]
+        result.sample_model = "claude-sonnet-4-6"
+        result.sample_input_tokens = 8
+        result.sample_output_tokens = 1
+        result.sample_latency_ms = 111
        result.raw_headers = {
            "anthropic-ratelimit-tokens-limit": "100000",
        }
@ -184,6 +196,10 @@ async def test_test_endpoint_returns_live_result(monkeypatch: pytest.MonkeyPatch
        assert data["account_key"] == "Claude"
        assert data["reachable"] is True
        assert data["models"] == ["claude-sonnet-4-6"]
+        assert data["sample_model"] == "claude-sonnet-4-6"
+        assert data["sample_input_tokens"] == 8
+        assert data["sample_output_tokens"] == 1
+        assert data["sample_latency_ms"] == 111
        assert data["debug_rate_limit_headers"] == ["anthropic-ratelimit-tokens-limit"]
    finally:
        await engine.dispose()
--- a/frontend/src/api/generated/model/providerUsageLiveRead.ts
+++ b/frontend/src/api/generated/model/providerUsageLiveRead.ts
@ -20,4 +20,9 @@ export interface ProviderUsageLiveRead {
  input_tokens: TokenWindowRead;
  requests: RequestWindowRead;
  models?: string[];
+  sample_model?: string | null;
+  sample_input_tokens?: number | null;
+  sample_output_tokens?: number | null;
+  sample_latency_ms?: number | null;
+  debug_rate_limit_headers?: string[] | null;
 }
--- a/frontend/src/app/settings/ai-providers/page.tsx
+++ b/frontend/src/app/settings/ai-providers/page.tsx
@ -236,8 +236,17 @@ function CredentialForm({
              {testResult.reachable ? "Connection successful" : "Connection failed"}
            </p>
            <p className="mt-1 text-muted">
-              {testResult.error ?? `${testResult.models?.length ?? 0} model${(testResult.models?.length ?? 0) === 1 ? "" : "s"} returned`}
+              {testResult.error ?? (
+                testResult.sample_input_tokens != null || testResult.sample_output_tokens != null
+                  ? `Usage probe: in ${fmtTokens(testResult.sample_input_tokens)} · out ${fmtTokens(testResult.sample_output_tokens)}`
+                  : "Connected."
+              )}
            </p>
+            {testResult.sample_latency_ms != null && (
+              <p className="mt-1 text-muted">
+                Probe time: {fmtLatencyMs(testResult.sample_latency_ms)}
+              </p>
+            )}

          </div>
        )}
@ -267,6 +276,12 @@ function fmtResetMs(ms: number | null | undefined): string {
  return `${h}h ${m % 60}m`;
 }

+function fmtLatencyMs(ms: number | null | undefined): string {
+  if (ms == null || ms < 0) return "—";
+  if (ms < 1000) return `${ms}ms`;
+  return `${(ms / 1000).toFixed(2)}s`;
+}
+
 function UsageStrip({ credentialId, provider }: { credentialId: string; provider: string }) {
  const [usage, setUsage] = useState<ProviderUsageLiveRead | null>(null);
  const [loading, setLoading] = useState(true);
@ -318,30 +333,58 @@ function UsageStrip({ credentialId, provider }: { credentialId: string; provider
  const inputTok = usage.input_tokens;
  const req = usage.requests;
  const isOllama = provider === "ollama";
-  const modelCount = usage.models?.length ?? 0;

  return (
    <div className="mt-2 rounded-lg border border-[color:var(--border)] bg-[color:var(--surface)] p-2.5">
      {isOllama ? (
-        <div className="flex items-center gap-3 text-xs text-muted">
-          <span className="flex items-center gap-1 text-[color:var(--success)]">
-            <span className="inline-block h-1.5 w-1.5 rounded-full bg-[color:var(--success)]" />
-            Connected
-          </span>
-          {(usage.models?.length ?? 0) > 0 && (
-            <span>{usage.models!.length} model{usage.models!.length !== 1 ? "s" : ""} available</span>
+        <div className="space-y-1.5">
+          <div className="flex items-center gap-3 text-xs text-muted">
+            <span className="flex items-center gap-1 text-[color:var(--success)]">
+              <span className="inline-block h-1.5 w-1.5 rounded-full bg-[color:var(--success)]" />
+              Connected
+            </span>
+            {(usage.models?.length ?? 0) > 0 && (
+              <span>{usage.models!.length} model{usage.models!.length !== 1 ? "s" : ""} available</span>
+            )}
+            <button type="button" onClick={() => fetch(true)} className="ml-auto text-muted hover:text-strong">
+              <RefreshCw className={`h-3 w-3 ${loading ? "animate-spin" : ""}`} />
+            </button>
+          </div>
+          {(usage.sample_input_tokens != null || usage.sample_output_tokens != null) && (
+            <div className="flex items-center justify-between text-[11px] text-muted">
+              <span>Usage (last probe)</span>
+              <span className="tabular-nums text-strong">
+                in {fmtTokens(usage.sample_input_tokens)} · out {fmtTokens(usage.sample_output_tokens)}
+              </span>
+            </div>
          )}
-          <button type="button" onClick={() => fetch(true)} className="ml-auto text-muted hover:text-strong">
-            <RefreshCw className={`h-3 w-3 ${loading ? "animate-spin" : ""}`} />
-          </button>
+          {usage.sample_latency_ms != null && (
+            <div className="flex items-center justify-between text-[11px] text-muted">
+              <span>Time (last probe)</span>
+              <span className="tabular-nums text-strong">
+                {fmtLatencyMs(usage.sample_latency_ms)}
+              </span>
+            </div>
+          )}
+          {usage.sample_latency_ms != null && (
+            <div className="flex items-center justify-between text-[11px] text-muted">
+              <span>Time (last probe)</span>
+              <span className="tabular-nums text-strong">
+                {fmtLatencyMs(usage.sample_latency_ms)}
+              </span>
+            </div>
+          )}
+          <div className="flex items-center justify-between text-[11px] text-muted">
+            {lastFetched && <span>Updated {Math.round((Date.now() - lastFetched.getTime()) / 1000)}s ago</span>}
+          </div>
        </div>
      ) : (
        <div className="space-y-1.5">
-          {modelCount > 0 && (
+          {(usage.sample_input_tokens != null || usage.sample_output_tokens != null) && (
            <div className="flex items-center justify-between text-[11px] text-muted">
-              <span>Models</span>
+              <span>Usage (last probe)</span>
              <span className="tabular-nums text-strong">
-                {modelCount} available
+                in {fmtTokens(usage.sample_input_tokens)} · out {fmtTokens(usage.sample_output_tokens)}
              </span>
            </div>
          )}
@ -415,7 +458,7 @@ function UsageStrip({ credentialId, provider }: { credentialId: string; provider

          {tok.limit == null && inputTok.limit == null && req.limit == null && (
            <p className="text-[11px] text-muted">
-              Connected — provider did not return token/request limit headers for this key tier.
+              Connected — no token/request limit windows were returned for this key right now.
            </p>
          )}