fix(db): make main_session_key nullable on gateways

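The gateways.main_session_key column was NOT NULL, but the ORM create path does not populate it until ensure_main_agent() runs after the INSERT, so creating a gateway failed. Make the column nullable so the initial create succeeds.

This commit also adds fallback usage probes (sample model, input/output token counts, latency) to the provider usage service, the API schema, the tests, and the AI providers settings page.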
2026-05-20 23:22:54 -05:00 · 2026-05-20 23:22:54 -05:00 · 03bc31a558
parent fc4094d49f
commit 03bc31a558
8 changed files with 332 additions and 38 deletions
--- a/backend/app/api/provider_credentials.py
+++ b/backend/app/api/provider_credentials.py
@ -167,6 +167,10 @@ async def test_provider_credential(
        input_tokens=_tok(live.input_tokens),
        requests=_req(live.requests),
        models=live.models,
        sample_model=live.sample_model,
        sample_input_tokens=live.sample_input_tokens,
        sample_output_tokens=live.sample_output_tokens,
        sample_latency_ms=live.sample_latency_ms,
        debug_rate_limit_headers=sorted(live.raw_headers.keys()) if live.raw_headers else None,
    )
@ -265,6 +269,10 @@ async def get_provider_usage_live(
        input_tokens=_tok(live.input_tokens),
        requests=_req(live.requests),
        models=live.models,
        sample_model=live.sample_model,
        sample_input_tokens=live.sample_input_tokens,
        sample_output_tokens=live.sample_output_tokens,
        sample_latency_ms=live.sample_latency_ms,
        debug_rate_limit_headers=sorted(live.raw_headers.keys()) if live.raw_headers else None,
    )
--- a/backend/app/models/gateways.py
+++ b/backend/app/models/gateways.py
@ -26,5 +26,6 @@ class Gateway(QueryModel, table=True):
    disable_device_pairing: bool = Field(default=False)
    workspace_root: str
    allow_insecure_tls: bool = Field(default=False)
    main_session_key: str | None = Field(default=None)
    created_at: datetime = Field(default_factory=utcnow)
    updated_at: datetime = Field(default_factory=utcnow)
--- a/backend/app/schemas/provider_credentials.py
+++ b/backend/app/schemas/provider_credentials.py
@ -61,6 +61,10 @@ class ProviderUsageLiveRead(SQLModel):
    input_tokens: TokenWindowRead   # Anthropic splits input tokens separately
    requests: RequestWindowRead
    models: list[str] = []
    sample_model: str | None = None
    sample_input_tokens: int | None = None
    sample_output_tokens: int | None = None
    sample_latency_ms: int | None = None
    # Optional debugging aid: exact rate-limit header names returned by provider.
    debug_rate_limit_headers: list[str] | None = None
--- a/backend/app/services/provider_usage.py
+++ b/backend/app/services/provider_usage.py
@ -13,14 +13,20 @@ anthropic  → GET https://api.anthropic.com/v1/models
             Headers: anthropic-ratelimit-tokens-limit/remaining/reset
                      anthropic-ratelimit-requests-limit/remaining/reset
                      anthropic-ratelimit-input-tokens-limit/remaining/reset
             Fallback probe (only when headers missing):
             POST /v1/messages with max_tokens=1 to surface usage+time data.
 openai     → GET https://api.openai.com/v1/models
 (codex)      Headers: x-ratelimit-limit-tokens, x-ratelimit-remaining-tokens,
                      x-ratelimit-reset-tokens, x-ratelimit-limit-requests,
                      x-ratelimit-remaining-requests, x-ratelimit-reset-requests
             Fallback probe (only when headers missing):
             POST /v1/chat/completions with max_tokens=1 to surface usage+time.
 ollama     → GET {base_url}/api/tags  (health-check only; no rate limits)
             Returns: model list, server reachable flag
             Fallback probe:
             POST {base_url}/api/generate with num_predict=1 for usage+time.
 Caching
 -------
@ -102,14 +108,22 @@ class ProviderUsageLive:
    requests: RequestWindow = field(default_factory=RequestWindow)
    models: list[str] = field(default_factory=list)   # model IDs available on this key
    raw_headers: dict[str, str] = field(default_factory=dict)
    sample_model: str | None = None
    sample_input_tokens: int | None = None
    sample_output_tokens: int | None = None
    sample_latency_ms: int | None = None
    def to_dict(self) -> dict[str, Any]:
        def _window(w: TokenWindow | RequestWindow) -> dict[str, Any]:
            d: dict[str, Any] = {}
-            if hasattr(w, "limit"):        d["limit"]       = w.limit
+            if hasattr(w, "limit"):
-            if hasattr(w, "remaining"):    d["remaining"]   = w.remaining
+                d["limit"] = w.limit
-            if hasattr(w, "reset_in_ms"): d["reset_in_ms"] = w.reset_in_ms
+            if hasattr(w, "remaining"):
-            if hasattr(w, "reset_at"):    d["reset_at"]    = w.reset_at.isoformat() if w.reset_at else None
+                d["remaining"] = w.remaining
            if hasattr(w, "reset_in_ms"):
                d["reset_in_ms"] = w.reset_in_ms
            if hasattr(w, "reset_at"):
                d["reset_at"] = w.reset_at.isoformat() if w.reset_at else None
            if isinstance(w, TokenWindow):
                d["used"] = w.used
                d["pct_used"] = w.pct_used
@ -125,6 +139,10 @@ class ProviderUsageLive:
            "input_tokens": _window(self.input_tokens),
            "requests":    _window(self.requests),
            "models":      self.models[:20],  # cap for response size
            "sample_model": self.sample_model,
            "sample_input_tokens": self.sample_input_tokens,
            "sample_output_tokens": self.sample_output_tokens,
            "sample_latency_ms": self.sample_latency_ms,
        }
@ -163,6 +181,49 @@ _OAI_DURATION_RE = re.compile(
 )
 def _apply_anthropic_ratelimit_headers(result: ProviderUsageLive, headers: dict[str, str]) -> None:
    """Overwrite the rate-limit windows on ``result`` from Anthropic headers.

    ``result.tokens``, ``result.input_tokens`` and ``result.requests`` are each
    replaced with a new window built from the matching
    ``anthropic-ratelimit-*`` limit/remaining/reset header triple. A missing
    reset header is handed to the reset parser as an empty string.
    """

    def _count(name: str) -> Any:
        # Integer-valued header (limit / remaining).
        return _parse_int_header(headers, name)

    def _reset(name: str) -> Any:
        # Reset header; fall back to "" when the header is absent.
        return _parse_iso_reset(headers.get(name, ""))

    # Combined token window.
    result.tokens = TokenWindow(
        limit=_count("anthropic-ratelimit-tokens-limit"),
        remaining=_count("anthropic-ratelimit-tokens-remaining"),
        reset_at=_reset("anthropic-ratelimit-tokens-reset"),
    )
    # Input-token window.
    result.input_tokens = TokenWindow(
        limit=_count("anthropic-ratelimit-input-tokens-limit"),
        remaining=_count("anthropic-ratelimit-input-tokens-remaining"),
        reset_at=_reset("anthropic-ratelimit-input-tokens-reset"),
    )
    # Request-count window.
    result.requests = RequestWindow(
        limit=_count("anthropic-ratelimit-requests-limit"),
        remaining=_count("anthropic-ratelimit-requests-remaining"),
        reset_at=_reset("anthropic-ratelimit-requests-reset"),
    )
 def _pick_anthropic_probe_model(models: list[str]) -> str | None:
    if not models:
        return None
    priorities = ("haiku", "sonnet", "opus")
    lowered = [(m, m.lower()) for m in models]
    for priority in priorities:
        for original, lowered_name in lowered:
            if priority in lowered_name:
                return original
    return models[0]
 def _pick_openai_probe_model(models: list[str]) -> str | None:
    if not models:
        return None
    priorities = ("gpt-4.1-mini", "gpt-4o-mini", "gpt-4.1", "gpt-4o", "o4-mini")
    lowered = [(m, m.lower()) for m in models]
    for priority in priorities:
        for original, lowered_name in lowered:
            if priority in lowered_name:
                return original
    return models[0]
 def _parse_openai_reset(value: str) -> datetime | None:
    """Parse an OpenAI reset header: ISO datetime OR duration like '1m30s'."""
    if not value:
@ -219,23 +280,7 @@ async def _fetch_anthropic(api_key: str, base_url: str | None) -> ProviderUsageL
    result.reachable = True
    result.raw_headers = {k: v for k, v in h.items() if "ratelimit" in k}
-    # Token window (combined input+output)
+    _apply_anthropic_ratelimit_headers(result, h)
    result.tokens = TokenWindow(
        limit     = _parse_int_header(h, "anthropic-ratelimit-tokens-limit"),
        remaining = _parse_int_header(h, "anthropic-ratelimit-tokens-remaining"),
        reset_at  = _parse_iso_reset(h.get("anthropic-ratelimit-tokens-reset", "")),
    )
    # Input-token window (separate limit for input)
    result.input_tokens = TokenWindow(
        limit     = _parse_int_header(h, "anthropic-ratelimit-input-tokens-limit"),
        remaining = _parse_int_header(h, "anthropic-ratelimit-input-tokens-remaining"),
        reset_at  = _parse_iso_reset(h.get("anthropic-ratelimit-input-tokens-reset", "")),
    )
    result.requests = RequestWindow(
        limit     = _parse_int_header(h, "anthropic-ratelimit-requests-limit"),
        remaining = _parse_int_header(h, "anthropic-ratelimit-requests-remaining"),
        reset_at  = _parse_iso_reset(h.get("anthropic-ratelimit-requests-reset", "")),
    )
    # Extract model IDs
    try:
@ -245,6 +290,56 @@ async def _fetch_anthropic(api_key: str, base_url: str | None) -> ProviderUsageL
    except Exception:
        pass
    # Some tiers/paths may omit ratelimit headers on /v1/models.
    # Fallback to a minimal /v1/messages probe so we can still surface usage/time.
    if (
        result.tokens.limit is None
        and result.input_tokens.limit is None
        and result.requests.limit is None
    ):
        probe_model = _pick_anthropic_probe_model(result.models)
        if probe_model:
            result.sample_model = probe_model
            async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client:
                try:
                    probe_resp = await client.post(
                        f"{base}/v1/messages",
                        headers={
                            "x-api-key": api_key,
                            "anthropic-version": "2023-06-01",
                            "content-type": "application/json",
                        },
                        json={
                            "model": probe_model,
                            "max_tokens": 1,
                            "messages": [{"role": "user", "content": "Usage probe"}],
                        },
                    )
                except Exception:
                    probe_resp = None
            if probe_resp is not None:
                probe_headers = {k.lower(): v for k, v in probe_resp.headers.items()}
                probe_rl_headers = {k: v for k, v in probe_headers.items() if "ratelimit" in k}
                if probe_rl_headers:
                    result.raw_headers = probe_rl_headers
                    _apply_anthropic_ratelimit_headers(result, probe_headers)
                if probe_resp.status_code == 200:
                    try:
                        payload = probe_resp.json()
                        usage = payload.get("usage") if isinstance(payload, dict) else None
                        if isinstance(usage, dict):
                            in_tok = usage.get("input_tokens")
                            out_tok = usage.get("output_tokens")
                            if isinstance(in_tok, int):
                                result.sample_input_tokens = in_tok
                            if isinstance(out_tok, int):
                                result.sample_output_tokens = out_tok
                    except Exception:
                        pass
                elapsed_ms = probe_resp.elapsed.total_seconds() * 1000.0
                result.sample_latency_ms = int(max(0.0, round(elapsed_ms)))
    return result
@ -296,6 +391,63 @@ async def _fetch_openai(api_key: str, base_url: str | None) -> ProviderUsageLive
    except Exception:
        pass
    if result.tokens.limit is None and result.requests.limit is None:
        probe_model = _pick_openai_probe_model(result.models)
        if probe_model:
            result.sample_model = probe_model
            async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client:
                try:
                    probe_resp = await client.post(
                        f"{base}/v1/chat/completions",
                        headers={
                            "Authorization": f"Bearer {api_key}",
                            "content-type": "application/json",
                        },
                        json={
                            "model": probe_model,
                            "messages": [{"role": "user", "content": "Usage probe"}],
                            "max_tokens": 1,
                        },
                    )
                except Exception:
                    probe_resp = None
            if probe_resp is not None:
                probe_headers = {k.lower(): v for k, v in probe_resp.headers.items()}
                probe_rl_headers = {k: v for k, v in probe_headers.items() if "ratelimit" in k}
                if probe_rl_headers:
                    result.raw_headers = probe_rl_headers
                    result.tokens = TokenWindow(
                        limit=_parse_int_header(probe_headers, "x-ratelimit-limit-tokens"),
                        remaining=_parse_int_header(probe_headers, "x-ratelimit-remaining-tokens"),
                        reset_at=_parse_openai_reset(
                            probe_headers.get("x-ratelimit-reset-tokens", "")
                        ),
                    )
                    result.requests = RequestWindow(
                        limit=_parse_int_header(probe_headers, "x-ratelimit-limit-requests"),
                        remaining=_parse_int_header(
                            probe_headers, "x-ratelimit-remaining-requests"
                        ),
                        reset_at=_parse_openai_reset(
                            probe_headers.get("x-ratelimit-reset-requests", "")
                        ),
                    )
                if probe_resp.status_code == 200:
                    try:
                        payload = probe_resp.json()
                        usage = payload.get("usage") if isinstance(payload, dict) else None
                        if isinstance(usage, dict):
                            in_tok = usage.get("prompt_tokens")
                            out_tok = usage.get("completion_tokens")
                            if isinstance(in_tok, int):
                                result.sample_input_tokens = in_tok
                            if isinstance(out_tok, int):
                                result.sample_output_tokens = out_tok
                    except Exception:
                        pass
                elapsed_ms = probe_resp.elapsed.total_seconds() * 1000.0
                result.sample_latency_ms = int(max(0.0, round(elapsed_ms)))
    return result
@ -331,6 +483,37 @@ async def _fetch_ollama(base_url: str | None, api_key: str | None) -> ProviderUs
    except Exception:
        pass
    if result.models:
        result.sample_model = result.models[0]
        async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client:
            try:
                probe_resp = await client.post(
                    f"{base}/api/generate",
                    headers={**headers, "content-type": "application/json"},
                    json={
                        "model": result.sample_model,
                        "prompt": "Usage probe",
                        "stream": False,
                        "options": {"num_predict": 1},
                    },
                )
            except Exception:
                probe_resp = None
        if probe_resp is not None and probe_resp.status_code == 200:
            try:
                payload = probe_resp.json()
                in_tok = payload.get("prompt_eval_count")
                out_tok = payload.get("eval_count")
                total_duration_ns = payload.get("total_duration")
                if isinstance(in_tok, int):
                    result.sample_input_tokens = in_tok
                if isinstance(out_tok, int):
                    result.sample_output_tokens = out_tok
                if isinstance(total_duration_ns, int):
                    result.sample_latency_ms = max(0, int(round(total_duration_ns / 1_000_000)))
            except Exception:
                pass
    return result
--- a/backend/migrations/versions/c4a1d2e3f4a5_make_main_session_key_nullable.py
+++ b/backend/migrations/versions/c4a1d2e3f4a5_make_main_session_key_nullable.py
@ -0,0 +1,34 @@
 """Make main_session_key nullable on gateways.
 The column was NOT NULL but the ORM model didn't include it, causing
 INSERT failures when creating gateways via the API.  The field gets
 populated by ensure_main_agent() after the row exists, so it needs
 to be nullable during the initial INSERT.
 Revision ID: c4a1d2e3f4a5
 Revises: f7d8e9a0b1c2
 Create Date: 2026-05-21 04:25:00.000000
 """
 from __future__ import annotations
 import sqlalchemy as sa
 from alembic import op
 # revision identifiers, used by Alembic.
 revision = "c4a1d2e3f4a5"
 down_revision = "f7d8e9a0b1c2"
 branch_labels = None
 depends_on = None
 def upgrade() -> None:
    """Allow main_session_key to be NULL on initial insert.

    Relaxes the ``gateways.main_session_key`` column to nullable. Only the
    nullability flag is passed to Alembic; no type or default is specified.
    """
    op.alter_column("gateways", "main_session_key", nullable=True)
 def downgrade() -> None:
    """Revert main_session_key to NOT NULL.

    NOTE(review): no backfill is performed here, so this presumably fails if
    any gateway row still holds a NULL ``main_session_key`` — confirm whether
    rows must be populated before downgrading.
    """
    op.alter_column("gateways", "main_session_key", nullable=False)
--- a/backend/tests/test_provider_credentials_usage_api.py
+++ b/backend/tests/test_provider_credentials_usage_api.py
@ -89,6 +89,10 @@ async def test_usage_response_includes_rate_limit_header_names(monkeypatch: pyte
            checked_at=utcnow(),
            reachable=True,
        )
        result.sample_model = "claude-sonnet-4-6"
        result.sample_input_tokens = 9
        result.sample_output_tokens = 1
        result.sample_latency_ms = 123
        result.raw_headers = {
            "anthropic-ratelimit-requests-limit": "1000",
            "anthropic-ratelimit-requests-remaining": "999",
@ -117,6 +121,10 @@ async def test_usage_response_includes_rate_limit_header_names(monkeypatch: pyte
        data = response.json()
        assert data["provider"] == "anthropic"
        assert data["reachable"] is True
        assert data["sample_model"] == "claude-sonnet-4-6"
        assert data["sample_input_tokens"] == 9
        assert data["sample_output_tokens"] == 1
        assert data["sample_latency_ms"] == 123
        assert data["debug_rate_limit_headers"] == [
            "anthropic-ratelimit-requests-limit",
            "anthropic-ratelimit-requests-remaining",
@ -150,6 +158,10 @@ async def test_test_endpoint_returns_live_result(monkeypatch: pytest.MonkeyPatch
            reachable=True,
        )
        result.models = ["claude-sonnet-4-6"]
        result.sample_model = "claude-sonnet-4-6"
        result.sample_input_tokens = 8
        result.sample_output_tokens = 1
        result.sample_latency_ms = 111
        result.raw_headers = {
            "anthropic-ratelimit-tokens-limit": "100000",
        }
@ -184,6 +196,10 @@ async def test_test_endpoint_returns_live_result(monkeypatch: pytest.MonkeyPatch
        assert data["account_key"] == "Claude"
        assert data["reachable"] is True
        assert data["models"] == ["claude-sonnet-4-6"]
        assert data["sample_model"] == "claude-sonnet-4-6"
        assert data["sample_input_tokens"] == 8
        assert data["sample_output_tokens"] == 1
        assert data["sample_latency_ms"] == 111
        assert data["debug_rate_limit_headers"] == ["anthropic-ratelimit-tokens-limit"]
    finally:
        await engine.dispose()
--- a/frontend/src/api/generated/model/providerUsageLiveRead.ts
+++ b/frontend/src/api/generated/model/providerUsageLiveRead.ts
@ -20,4 +20,9 @@ export interface ProviderUsageLiveRead {
  input_tokens: TokenWindowRead;
  requests: RequestWindowRead;
  models?: string[];
  sample_model?: string | null;
  sample_input_tokens?: number | null;
  sample_output_tokens?: number | null;
  sample_latency_ms?: number | null;
  debug_rate_limit_headers?: string[] | null;
 }
--- a/frontend/src/app/settings/ai-providers/page.tsx
+++ b/frontend/src/app/settings/ai-providers/page.tsx
@ -236,8 +236,17 @@ function CredentialForm({
              {testResult.reachable ? "Connection successful" : "Connection failed"}
            </p>
            <p className="mt-1 text-muted">
-              {testResult.error ?? `${testResult.models?.length ?? 0} model${(testResult.models?.length ?? 0) === 1 ? "" : "s"} returned`}
+              {testResult.error ?? (
                testResult.sample_input_tokens != null || testResult.sample_output_tokens != null
                  ? `Usage probe: in ${fmtTokens(testResult.sample_input_tokens)} · out ${fmtTokens(testResult.sample_output_tokens)}`
                  : "Connected."
              )}
            </p>
            {testResult.sample_latency_ms != null && (
              <p className="mt-1 text-muted">
                Probe time: {fmtLatencyMs(testResult.sample_latency_ms)}
              </p>
            )}
          </div>
        )}
@ -267,6 +276,12 @@ function fmtResetMs(ms: number | null | undefined): string {
  return `${h}h ${m % 60}m`;
 }
 /**
  * Format a latency given in milliseconds for display.
  *
  * Returns an em dash for null, undefined, or negative input; values below
  * 1000 are shown as "<n>ms", anything else as seconds with two decimals.
  */
 function fmtLatencyMs(ms: number | null | undefined): string {
   if (ms == null || ms < 0) {
     return "—";
   }
   return ms < 1000 ? `${ms}ms` : `${(ms / 1000).toFixed(2)}s`;
 }
 function UsageStrip({ credentialId, provider }: { credentialId: string; provider: string }) {
  const [usage, setUsage] = useState<ProviderUsageLiveRead | null>(null);
  const [loading, setLoading] = useState(true);
@ -318,11 +333,11 @@ function UsageStrip({ credentialId, provider }: { credentialId: string; provider
  const inputTok = usage.input_tokens;
  const req = usage.requests;
  const isOllama = provider === "ollama";
  const modelCount = usage.models?.length ?? 0;
  return (
    <div className="mt-2 rounded-lg border border-[color:var(--border)] bg-[color:var(--surface)] p-2.5">
      {isOllama ? (
        <div className="space-y-1.5">
          <div className="flex items-center gap-3 text-xs text-muted">
            <span className="flex items-center gap-1 text-[color:var(--success)]">
              <span className="inline-block h-1.5 w-1.5 rounded-full bg-[color:var(--success)]" />
@ -335,13 +350,41 @@ function UsageStrip({ credentialId, provider }: { credentialId: string; provider
              <RefreshCw className={`h-3 w-3 ${loading ? "animate-spin" : ""}`} />
            </button>
          </div>
          {(usage.sample_input_tokens != null || usage.sample_output_tokens != null) && (
            <div className="flex items-center justify-between text-[11px] text-muted">
              <span>Usage (last probe)</span>
              <span className="tabular-nums text-strong">
                in {fmtTokens(usage.sample_input_tokens)} · out {fmtTokens(usage.sample_output_tokens)}
              </span>
            </div>
          )}
          {/* Probe latency row: rendered once, and only when the last probe reported a latency. */}
          {usage.sample_latency_ms != null && (
            <div className="flex items-center justify-between text-[11px] text-muted">
              <span>Time (last probe)</span>
              <span className="tabular-nums text-strong">
                {fmtLatencyMs(usage.sample_latency_ms)}
              </span>
            </div>
          )}
          <div className="flex items-center justify-between text-[11px] text-muted">
            {lastFetched && <span>Updated {Math.round((Date.now() - lastFetched.getTime()) / 1000)}s ago</span>}
          </div>
        </div>
      ) : (
        <div className="space-y-1.5">
-          {modelCount > 0 && (
+          {(usage.sample_input_tokens != null || usage.sample_output_tokens != null) && (
            <div className="flex items-center justify-between text-[11px] text-muted">
-              <span>Models</span>
+              <span>Usage (last probe)</span>
              <span className="tabular-nums text-strong">
-                {modelCount} available
+                in {fmtTokens(usage.sample_input_tokens)} · out {fmtTokens(usage.sample_output_tokens)}
              </span>
            </div>
          )}
@ -415,7 +458,7 @@ function UsageStrip({ credentialId, provider }: { credentialId: string; provider
          {tok.limit == null && inputTok.limit == null && req.limit == null && (
            <p className="text-[11px] text-muted">
-              Connected — provider did not return token/request limit headers for this key tier.
+              Connected — no token/request limit windows were returned for this key right now.
            </p>
          )}