fix(db): make main_session_key nullable on gateways

The column was NOT NULL but the ORM create path doesn't populate it
until ensure_main_agent() runs after INSERT. Make it nullable so the
initial create succeeds.
This commit is contained in:
null 2026-05-20 23:22:54 -05:00
parent fc4094d49f
commit 03bc31a558
8 changed files with 332 additions and 38 deletions

View File

@ -167,6 +167,10 @@ async def test_provider_credential(
input_tokens=_tok(live.input_tokens),
requests=_req(live.requests),
models=live.models,
sample_model=live.sample_model,
sample_input_tokens=live.sample_input_tokens,
sample_output_tokens=live.sample_output_tokens,
sample_latency_ms=live.sample_latency_ms,
debug_rate_limit_headers=sorted(live.raw_headers.keys()) if live.raw_headers else None,
)
@ -265,6 +269,10 @@ async def get_provider_usage_live(
input_tokens=_tok(live.input_tokens),
requests=_req(live.requests),
models=live.models,
sample_model=live.sample_model,
sample_input_tokens=live.sample_input_tokens,
sample_output_tokens=live.sample_output_tokens,
sample_latency_ms=live.sample_latency_ms,
debug_rate_limit_headers=sorted(live.raw_headers.keys()) if live.raw_headers else None,
)

View File

@ -26,5 +26,6 @@ class Gateway(QueryModel, table=True):
disable_device_pairing: bool = Field(default=False)
workspace_root: str
allow_insecure_tls: bool = Field(default=False)
main_session_key: str | None = Field(default=None)
created_at: datetime = Field(default_factory=utcnow)
updated_at: datetime = Field(default_factory=utcnow)

View File

@ -61,6 +61,10 @@ class ProviderUsageLiveRead(SQLModel):
input_tokens: TokenWindowRead # Anthropic splits input tokens separately
requests: RequestWindowRead
models: list[str] = []
sample_model: str | None = None
sample_input_tokens: int | None = None
sample_output_tokens: int | None = None
sample_latency_ms: int | None = None
# Optional debugging aid: exact rate-limit header names returned by provider.
debug_rate_limit_headers: list[str] | None = None

View File

@ -13,14 +13,20 @@ anthropic → GET https://api.anthropic.com/v1/models
Headers: anthropic-ratelimit-tokens-limit/remaining/reset
anthropic-ratelimit-requests-limit/remaining/reset
anthropic-ratelimit-input-tokens-limit/remaining/reset
Fallback probe (only when headers missing):
POST /v1/messages with max_tokens=1 to surface usage+time data.
openai GET https://api.openai.com/v1/models
(codex) Headers: x-ratelimit-limit-tokens, x-ratelimit-remaining-tokens,
x-ratelimit-reset-tokens, x-ratelimit-limit-requests,
x-ratelimit-remaining-requests, x-ratelimit-reset-requests
Fallback probe (only when headers missing):
POST /v1/chat/completions with max_tokens=1 to surface usage+time.
ollama GET {base_url}/api/tags (health-check only; no rate limits)
Returns: model list, server reachable flag
Usage probe (not a header fallback; runs when a model is listed):
POST {base_url}/api/generate with num_predict=1 for usage+time.
Caching
-------
@ -102,16 +108,24 @@ class ProviderUsageLive:
requests: RequestWindow = field(default_factory=RequestWindow)
models: list[str] = field(default_factory=list) # model IDs available on this key
raw_headers: dict[str, str] = field(default_factory=dict)
sample_model: str | None = None
sample_input_tokens: int | None = None
sample_output_tokens: int | None = None
sample_latency_ms: int | None = None
def to_dict(self) -> dict[str, Any]:
def _window(w: TokenWindow | RequestWindow) -> dict[str, Any]:
d: dict[str, Any] = {}
if hasattr(w, "limit"): d["limit"] = w.limit
if hasattr(w, "remaining"): d["remaining"] = w.remaining
if hasattr(w, "reset_in_ms"): d["reset_in_ms"] = w.reset_in_ms
if hasattr(w, "reset_at"): d["reset_at"] = w.reset_at.isoformat() if w.reset_at else None
if hasattr(w, "limit"):
d["limit"] = w.limit
if hasattr(w, "remaining"):
d["remaining"] = w.remaining
if hasattr(w, "reset_in_ms"):
d["reset_in_ms"] = w.reset_in_ms
if hasattr(w, "reset_at"):
d["reset_at"] = w.reset_at.isoformat() if w.reset_at else None
if isinstance(w, TokenWindow):
d["used"] = w.used
d["used"] = w.used
d["pct_used"] = w.pct_used
return d
@ -125,6 +139,10 @@ class ProviderUsageLive:
"input_tokens": _window(self.input_tokens),
"requests": _window(self.requests),
"models": self.models[:20], # cap for response size
"sample_model": self.sample_model,
"sample_input_tokens": self.sample_input_tokens,
"sample_output_tokens": self.sample_output_tokens,
"sample_latency_ms": self.sample_latency_ms,
}
@ -163,6 +181,49 @@ _OAI_DURATION_RE = re.compile(
)
def _apply_anthropic_ratelimit_headers(result: ProviderUsageLive, headers: dict[str, str]) -> None:
    """Overwrite the rate-limit windows on ``result`` from Anthropic headers.

    ``result.tokens``, ``result.input_tokens`` and ``result.requests`` are each
    replaced with a freshly built window whose limit / remaining / reset values
    are read from the matching ``anthropic-ratelimit-*`` entries of ``headers``.
    Header names are looked up using the lowercase spellings below. The
    function mutates ``result`` in place and returns nothing.
    """
    # Token window (combined input+output).
    tokens_limit = _parse_int_header(headers, "anthropic-ratelimit-tokens-limit")
    tokens_remaining = _parse_int_header(headers, "anthropic-ratelimit-tokens-remaining")
    tokens_reset = _parse_iso_reset(headers.get("anthropic-ratelimit-tokens-reset", ""))
    result.tokens = TokenWindow(
        limit=tokens_limit,
        remaining=tokens_remaining,
        reset_at=tokens_reset,
    )

    # Input-token window (separate limit for input).
    input_limit = _parse_int_header(headers, "anthropic-ratelimit-input-tokens-limit")
    input_remaining = _parse_int_header(headers, "anthropic-ratelimit-input-tokens-remaining")
    input_reset = _parse_iso_reset(headers.get("anthropic-ratelimit-input-tokens-reset", ""))
    result.input_tokens = TokenWindow(
        limit=input_limit,
        remaining=input_remaining,
        reset_at=input_reset,
    )

    # Request-count window.
    requests_limit = _parse_int_header(headers, "anthropic-ratelimit-requests-limit")
    requests_remaining = _parse_int_header(headers, "anthropic-ratelimit-requests-remaining")
    requests_reset = _parse_iso_reset(headers.get("anthropic-ratelimit-requests-reset", ""))
    result.requests = RequestWindow(
        limit=requests_limit,
        remaining=requests_remaining,
        reset_at=requests_reset,
    )
def _pick_anthropic_probe_model(models: list[str]) -> str | None:
if not models:
return None
priorities = ("haiku", "sonnet", "opus")
lowered = [(m, m.lower()) for m in models]
for priority in priorities:
for original, lowered_name in lowered:
if priority in lowered_name:
return original
return models[0]
def _pick_openai_probe_model(models: list[str]) -> str | None:
if not models:
return None
priorities = ("gpt-4.1-mini", "gpt-4o-mini", "gpt-4.1", "gpt-4o", "o4-mini")
lowered = [(m, m.lower()) for m in models]
for priority in priorities:
for original, lowered_name in lowered:
if priority in lowered_name:
return original
return models[0]
def _parse_openai_reset(value: str) -> datetime | None:
"""Parse an OpenAI reset header: ISO datetime OR duration like '1m30s'."""
if not value:
@ -219,23 +280,7 @@ async def _fetch_anthropic(api_key: str, base_url: str | None) -> ProviderUsageL
result.reachable = True
result.raw_headers = {k: v for k, v in h.items() if "ratelimit" in k}
# Token window (combined input+output)
result.tokens = TokenWindow(
limit = _parse_int_header(h, "anthropic-ratelimit-tokens-limit"),
remaining = _parse_int_header(h, "anthropic-ratelimit-tokens-remaining"),
reset_at = _parse_iso_reset(h.get("anthropic-ratelimit-tokens-reset", "")),
)
# Input-token window (separate limit for input)
result.input_tokens = TokenWindow(
limit = _parse_int_header(h, "anthropic-ratelimit-input-tokens-limit"),
remaining = _parse_int_header(h, "anthropic-ratelimit-input-tokens-remaining"),
reset_at = _parse_iso_reset(h.get("anthropic-ratelimit-input-tokens-reset", "")),
)
result.requests = RequestWindow(
limit = _parse_int_header(h, "anthropic-ratelimit-requests-limit"),
remaining = _parse_int_header(h, "anthropic-ratelimit-requests-remaining"),
reset_at = _parse_iso_reset(h.get("anthropic-ratelimit-requests-reset", "")),
)
_apply_anthropic_ratelimit_headers(result, h)
# Extract model IDs
try:
@ -245,6 +290,56 @@ async def _fetch_anthropic(api_key: str, base_url: str | None) -> ProviderUsageL
except Exception:
pass
# Some tiers/paths may omit ratelimit headers on /v1/models.
# Fallback to a minimal /v1/messages probe so we can still surface usage/time.
if (
result.tokens.limit is None
and result.input_tokens.limit is None
and result.requests.limit is None
):
probe_model = _pick_anthropic_probe_model(result.models)
if probe_model:
result.sample_model = probe_model
async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client:
try:
probe_resp = await client.post(
f"{base}/v1/messages",
headers={
"x-api-key": api_key,
"anthropic-version": "2023-06-01",
"content-type": "application/json",
},
json={
"model": probe_model,
"max_tokens": 1,
"messages": [{"role": "user", "content": "Usage probe"}],
},
)
except Exception:
probe_resp = None
if probe_resp is not None:
probe_headers = {k.lower(): v for k, v in probe_resp.headers.items()}
probe_rl_headers = {k: v for k, v in probe_headers.items() if "ratelimit" in k}
if probe_rl_headers:
result.raw_headers = probe_rl_headers
_apply_anthropic_ratelimit_headers(result, probe_headers)
if probe_resp.status_code == 200:
try:
payload = probe_resp.json()
usage = payload.get("usage") if isinstance(payload, dict) else None
if isinstance(usage, dict):
in_tok = usage.get("input_tokens")
out_tok = usage.get("output_tokens")
if isinstance(in_tok, int):
result.sample_input_tokens = in_tok
if isinstance(out_tok, int):
result.sample_output_tokens = out_tok
except Exception:
pass
elapsed_ms = probe_resp.elapsed.total_seconds() * 1000.0
result.sample_latency_ms = int(max(0.0, round(elapsed_ms)))
return result
@ -296,6 +391,63 @@ async def _fetch_openai(api_key: str, base_url: str | None) -> ProviderUsageLive
except Exception:
pass
if result.tokens.limit is None and result.requests.limit is None:
probe_model = _pick_openai_probe_model(result.models)
if probe_model:
result.sample_model = probe_model
async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client:
try:
probe_resp = await client.post(
f"{base}/v1/chat/completions",
headers={
"Authorization": f"Bearer {api_key}",
"content-type": "application/json",
},
json={
"model": probe_model,
"messages": [{"role": "user", "content": "Usage probe"}],
"max_tokens": 1,
},
)
except Exception:
probe_resp = None
if probe_resp is not None:
probe_headers = {k.lower(): v for k, v in probe_resp.headers.items()}
probe_rl_headers = {k: v for k, v in probe_headers.items() if "ratelimit" in k}
if probe_rl_headers:
result.raw_headers = probe_rl_headers
result.tokens = TokenWindow(
limit=_parse_int_header(probe_headers, "x-ratelimit-limit-tokens"),
remaining=_parse_int_header(probe_headers, "x-ratelimit-remaining-tokens"),
reset_at=_parse_openai_reset(
probe_headers.get("x-ratelimit-reset-tokens", "")
),
)
result.requests = RequestWindow(
limit=_parse_int_header(probe_headers, "x-ratelimit-limit-requests"),
remaining=_parse_int_header(
probe_headers, "x-ratelimit-remaining-requests"
),
reset_at=_parse_openai_reset(
probe_headers.get("x-ratelimit-reset-requests", "")
),
)
if probe_resp.status_code == 200:
try:
payload = probe_resp.json()
usage = payload.get("usage") if isinstance(payload, dict) else None
if isinstance(usage, dict):
in_tok = usage.get("prompt_tokens")
out_tok = usage.get("completion_tokens")
if isinstance(in_tok, int):
result.sample_input_tokens = in_tok
if isinstance(out_tok, int):
result.sample_output_tokens = out_tok
except Exception:
pass
elapsed_ms = probe_resp.elapsed.total_seconds() * 1000.0
result.sample_latency_ms = int(max(0.0, round(elapsed_ms)))
return result
@ -331,6 +483,37 @@ async def _fetch_ollama(base_url: str | None, api_key: str | None) -> ProviderUs
except Exception:
pass
if result.models:
result.sample_model = result.models[0]
async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client:
try:
probe_resp = await client.post(
f"{base}/api/generate",
headers={**headers, "content-type": "application/json"},
json={
"model": result.sample_model,
"prompt": "Usage probe",
"stream": False,
"options": {"num_predict": 1},
},
)
except Exception:
probe_resp = None
if probe_resp is not None and probe_resp.status_code == 200:
try:
payload = probe_resp.json()
in_tok = payload.get("prompt_eval_count")
out_tok = payload.get("eval_count")
total_duration_ns = payload.get("total_duration")
if isinstance(in_tok, int):
result.sample_input_tokens = in_tok
if isinstance(out_tok, int):
result.sample_output_tokens = out_tok
if isinstance(total_duration_ns, int):
result.sample_latency_ms = max(0, int(round(total_duration_ns / 1_000_000)))
except Exception:
pass
return result

View File

@ -0,0 +1,34 @@
"""Make main_session_key nullable on gateways.
The column was NOT NULL but the ORM model didn't include it, causing
INSERT failures when creating gateways via the API. The field gets
populated by ensure_main_agent() after the row exists, so it needs
to be nullable during the initial INSERT.
Revision ID: c4a1d2e3f4a5
Revises: f7d8e9a0b1c2
Create Date: 2026-05-21 04:25:00.000000
"""
from __future__ import annotations
import sqlalchemy as sa
from alembic import op
# revision identifiers, used by Alembic.
revision = "c4a1d2e3f4a5"
down_revision = "f7d8e9a0b1c2"
branch_labels = None
depends_on = None
def upgrade() -> None:
    """Allow main_session_key to be NULL on initial insert."""
    table, column = "gateways", "main_session_key"
    # Only the nullability of the column is altered.
    op.alter_column(table, column, nullable=True)
def downgrade() -> None:
    """Revert main_session_key to NOT NULL."""
    table, column = "gateways", "main_session_key"
    # NOTE(review): the database will presumably reject this while any row
    # still has a NULL main_session_key -- confirm whether a backfill is
    # needed before running the downgrade.
    op.alter_column(table, column, nullable=False)

View File

@ -89,6 +89,10 @@ async def test_usage_response_includes_rate_limit_header_names(monkeypatch: pyte
checked_at=utcnow(),
reachable=True,
)
result.sample_model = "claude-sonnet-4-6"
result.sample_input_tokens = 9
result.sample_output_tokens = 1
result.sample_latency_ms = 123
result.raw_headers = {
"anthropic-ratelimit-requests-limit": "1000",
"anthropic-ratelimit-requests-remaining": "999",
@ -117,6 +121,10 @@ async def test_usage_response_includes_rate_limit_header_names(monkeypatch: pyte
data = response.json()
assert data["provider"] == "anthropic"
assert data["reachable"] is True
assert data["sample_model"] == "claude-sonnet-4-6"
assert data["sample_input_tokens"] == 9
assert data["sample_output_tokens"] == 1
assert data["sample_latency_ms"] == 123
assert data["debug_rate_limit_headers"] == [
"anthropic-ratelimit-requests-limit",
"anthropic-ratelimit-requests-remaining",
@ -150,6 +158,10 @@ async def test_test_endpoint_returns_live_result(monkeypatch: pytest.MonkeyPatch
reachable=True,
)
result.models = ["claude-sonnet-4-6"]
result.sample_model = "claude-sonnet-4-6"
result.sample_input_tokens = 8
result.sample_output_tokens = 1
result.sample_latency_ms = 111
result.raw_headers = {
"anthropic-ratelimit-tokens-limit": "100000",
}
@ -184,6 +196,10 @@ async def test_test_endpoint_returns_live_result(monkeypatch: pytest.MonkeyPatch
assert data["account_key"] == "Claude"
assert data["reachable"] is True
assert data["models"] == ["claude-sonnet-4-6"]
assert data["sample_model"] == "claude-sonnet-4-6"
assert data["sample_input_tokens"] == 8
assert data["sample_output_tokens"] == 1
assert data["sample_latency_ms"] == 111
assert data["debug_rate_limit_headers"] == ["anthropic-ratelimit-tokens-limit"]
finally:
await engine.dispose()

View File

@ -20,4 +20,9 @@ export interface ProviderUsageLiveRead {
input_tokens: TokenWindowRead;
requests: RequestWindowRead;
models?: string[];
sample_model?: string | null;
sample_input_tokens?: number | null;
sample_output_tokens?: number | null;
sample_latency_ms?: number | null;
debug_rate_limit_headers?: string[] | null;
}

View File

@ -236,8 +236,17 @@ function CredentialForm({
{testResult.reachable ? "Connection successful" : "Connection failed"}
</p>
<p className="mt-1 text-muted">
{testResult.error ?? `${testResult.models?.length ?? 0} model${(testResult.models?.length ?? 0) === 1 ? "" : "s"} returned`}
{testResult.error ?? (
testResult.sample_input_tokens != null || testResult.sample_output_tokens != null
? `Usage probe: in ${fmtTokens(testResult.sample_input_tokens)} · out ${fmtTokens(testResult.sample_output_tokens)}`
: "Connected."
)}
</p>
{testResult.sample_latency_ms != null && (
<p className="mt-1 text-muted">
Probe time: {fmtLatencyMs(testResult.sample_latency_ms)}
</p>
)}
</div>
)}
@ -267,6 +276,12 @@ function fmtResetMs(ms: number | null | undefined): string {
return `${h}h ${m % 60}m`;
}
/**
 * Format a probe latency for display.
 *
 * Missing (null/undefined) or negative values render as an em dash, values
 * under one second render as "<ms>ms", and anything else renders as seconds
 * with two decimal places.
 */
function fmtLatencyMs(ms: number | null | undefined): string {
  if (ms == null || ms < 0) {
    return "—";
  }
  return ms < 1000 ? `${ms}ms` : `${(ms / 1000).toFixed(2)}s`;
}
function UsageStrip({ credentialId, provider }: { credentialId: string; provider: string }) {
const [usage, setUsage] = useState<ProviderUsageLiveRead | null>(null);
const [loading, setLoading] = useState(true);
@ -318,30 +333,58 @@ function UsageStrip({ credentialId, provider }: { credentialId: string; provider
const inputTok = usage.input_tokens;
const req = usage.requests;
const isOllama = provider === "ollama";
const modelCount = usage.models?.length ?? 0;
return (
<div className="mt-2 rounded-lg border border-[color:var(--border)] bg-[color:var(--surface)] p-2.5">
{isOllama ? (
<div className="flex items-center gap-3 text-xs text-muted">
<span className="flex items-center gap-1 text-[color:var(--success)]">
<span className="inline-block h-1.5 w-1.5 rounded-full bg-[color:var(--success)]" />
Connected
</span>
{(usage.models?.length ?? 0) > 0 && (
<span>{usage.models!.length} model{usage.models!.length !== 1 ? "s" : ""} available</span>
<div className="space-y-1.5">
<div className="flex items-center gap-3 text-xs text-muted">
<span className="flex items-center gap-1 text-[color:var(--success)]">
<span className="inline-block h-1.5 w-1.5 rounded-full bg-[color:var(--success)]" />
Connected
</span>
{(usage.models?.length ?? 0) > 0 && (
<span>{usage.models!.length} model{usage.models!.length !== 1 ? "s" : ""} available</span>
)}
<button type="button" onClick={() => fetch(true)} className="ml-auto text-muted hover:text-strong">
<RefreshCw className={`h-3 w-3 ${loading ? "animate-spin" : ""}`} />
</button>
</div>
{(usage.sample_input_tokens != null || usage.sample_output_tokens != null) && (
<div className="flex items-center justify-between text-[11px] text-muted">
<span>Usage (last probe)</span>
<span className="tabular-nums text-strong">
in {fmtTokens(usage.sample_input_tokens)} · out {fmtTokens(usage.sample_output_tokens)}
</span>
</div>
)}
<button type="button" onClick={() => fetch(true)} className="ml-auto text-muted hover:text-strong">
<RefreshCw className={`h-3 w-3 ${loading ? "animate-spin" : ""}`} />
</button>
{usage.sample_latency_ms != null && (
<div className="flex items-center justify-between text-[11px] text-muted">
<span>Time (last probe)</span>
<span className="tabular-nums text-strong">
{fmtLatencyMs(usage.sample_latency_ms)}
</span>
</div>
)}
{usage.sample_latency_ms != null && (
<div className="flex items-center justify-between text-[11px] text-muted">
<span>Time (last probe)</span>
<span className="tabular-nums text-strong">
{fmtLatencyMs(usage.sample_latency_ms)}
</span>
</div>
)}
<div className="flex items-center justify-between text-[11px] text-muted">
{lastFetched && <span>Updated {Math.round((Date.now() - lastFetched.getTime()) / 1000)}s ago</span>}
</div>
</div>
) : (
<div className="space-y-1.5">
{modelCount > 0 && (
{(usage.sample_input_tokens != null || usage.sample_output_tokens != null) && (
<div className="flex items-center justify-between text-[11px] text-muted">
<span>Models</span>
<span>Usage (last probe)</span>
<span className="tabular-nums text-strong">
{modelCount} available
in {fmtTokens(usage.sample_input_tokens)} · out {fmtTokens(usage.sample_output_tokens)}
</span>
</div>
)}
@ -415,7 +458,7 @@ function UsageStrip({ credentialId, provider }: { credentialId: string; provider
{tok.limit == null && inputTok.limit == null && req.limit == null && (
<p className="text-[11px] text-muted">
Connected provider did not return token/request limit headers for this key tier.
Connected no token/request limit windows were returned for this key right now.
</p>
)}