From 03bc31a558745353dc0088845cfe01052e1662fe Mon Sep 17 00:00:00 2001
From: null <koga.industries@gmail.com>
Date: Wed, 20 May 2026 23:22:54 -0500
Subject: [PATCH] fix(db): make main_session_key nullable on gateways

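The column was NOT NULL, but the ORM create path does not populate it
until ensure_main_agent() runs after the INSERT. Make it nullable (model
field plus Alembic migration) so the initial create succeeds.

Note: this patch also bundles a separate provider-usage change: a minimal
usage probe for Anthropic/OpenAI/Ollama and the resulting sample_* fields,
surfaced through the API schema, tests, and the AI-providers settings UI.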
---
 backend/app/api/provider_credentials.py       |   8 +
 backend/app/models/gateways.py                |   1 +
 backend/app/schemas/provider_credentials.py   |   4 +
 backend/app/services/provider_usage.py        | 227 ++++++++++++++++--
 ...d2e3f4a5_make_main_session_key_nullable.py |  34 +++
 .../test_provider_credentials_usage_api.py    |  16 ++
 .../generated/model/providerUsageLiveRead.ts  |   5 +
 .../src/app/settings/ai-providers/page.tsx    |  75 ++++--
 8 files changed, 332 insertions(+), 38 deletions(-)
 create mode 100644 backend/migrations/versions/c4a1d2e3f4a5_make_main_session_key_nullable.py

diff --git a/backend/app/api/provider_credentials.py b/backend/app/api/provider_credentials.py
index 2734639..1a4db21 100644
--- a/backend/app/api/provider_credentials.py
+++ b/backend/app/api/provider_credentials.py
@@ -167,6 +167,10 @@ async def test_provider_credential(
         input_tokens=_tok(live.input_tokens),
         requests=_req(live.requests),
         models=live.models,
+        sample_model=live.sample_model,
+        sample_input_tokens=live.sample_input_tokens,
+        sample_output_tokens=live.sample_output_tokens,
+        sample_latency_ms=live.sample_latency_ms,
         debug_rate_limit_headers=sorted(live.raw_headers.keys()) if live.raw_headers else None,
     )
 
@@ -265,6 +269,10 @@ async def get_provider_usage_live(
         input_tokens=_tok(live.input_tokens),
         requests=_req(live.requests),
         models=live.models,
+        sample_model=live.sample_model,
+        sample_input_tokens=live.sample_input_tokens,
+        sample_output_tokens=live.sample_output_tokens,
+        sample_latency_ms=live.sample_latency_ms,
         debug_rate_limit_headers=sorted(live.raw_headers.keys()) if live.raw_headers else None,
     )
 
diff --git a/backend/app/models/gateways.py b/backend/app/models/gateways.py
index 3451a10..68e6776 100644
--- a/backend/app/models/gateways.py
+++ b/backend/app/models/gateways.py
@@ -26,5 +26,6 @@ class Gateway(QueryModel, table=True):
     disable_device_pairing: bool = Field(default=False)
     workspace_root: str
     allow_insecure_tls: bool = Field(default=False)
+    main_session_key: str | None = Field(default=None)
     created_at: datetime = Field(default_factory=utcnow)
     updated_at: datetime = Field(default_factory=utcnow)
diff --git a/backend/app/schemas/provider_credentials.py b/backend/app/schemas/provider_credentials.py
index 46dba5b..25a40d4 100644
--- a/backend/app/schemas/provider_credentials.py
+++ b/backend/app/schemas/provider_credentials.py
@@ -61,6 +61,10 @@ class ProviderUsageLiveRead(SQLModel):
     input_tokens: TokenWindowRead   # Anthropic splits input tokens separately
     requests: RequestWindowRead
     models: list[str] = []
+    sample_model: str | None = None
+    sample_input_tokens: int | None = None
+    sample_output_tokens: int | None = None
+    sample_latency_ms: int | None = None
     # Optional debugging aid: exact rate-limit header names returned by provider.
     debug_rate_limit_headers: list[str] | None = None
 
diff --git a/backend/app/services/provider_usage.py b/backend/app/services/provider_usage.py
index 6dcae63..d2ce6c2 100644
--- a/backend/app/services/provider_usage.py
+++ b/backend/app/services/provider_usage.py
@@ -13,14 +13,20 @@ anthropic  → GET https://api.anthropic.com/v1/models
              Headers: anthropic-ratelimit-tokens-limit/remaining/reset
                       anthropic-ratelimit-requests-limit/remaining/reset
                       anthropic-ratelimit-input-tokens-limit/remaining/reset
+             Fallback probe (only when headers missing):
+             POST /v1/messages with max_tokens=1 to surface usage+time data.
 
 openai     → GET https://api.openai.com/v1/models
 (codex)      Headers: x-ratelimit-limit-tokens, x-ratelimit-remaining-tokens,
                       x-ratelimit-reset-tokens, x-ratelimit-limit-requests,
                       x-ratelimit-remaining-requests, x-ratelimit-reset-requests
+             Fallback probe (only when headers missing):
+             POST /v1/chat/completions with max_tokens=1 to surface usage+time.
 
 ollama     → GET {base_url}/api/tags  (health-check only; no rate limits)
              Returns: model list, server reachable flag
+             Usage probe (sent whenever at least one model is listed):
+             POST {base_url}/api/generate with num_predict=1 for usage+time.
 
 Caching
 -------
@@ -102,16 +108,24 @@ class ProviderUsageLive:
     requests: RequestWindow = field(default_factory=RequestWindow)
     models: list[str] = field(default_factory=list)   # model IDs available on this key
     raw_headers: dict[str, str] = field(default_factory=dict)
+    sample_model: str | None = None
+    sample_input_tokens: int | None = None
+    sample_output_tokens: int | None = None
+    sample_latency_ms: int | None = None
 
     def to_dict(self) -> dict[str, Any]:
         def _window(w: TokenWindow | RequestWindow) -> dict[str, Any]:
             d: dict[str, Any] = {}
-            if hasattr(w, "limit"):        d["limit"]       = w.limit
-            if hasattr(w, "remaining"):    d["remaining"]   = w.remaining
-            if hasattr(w, "reset_in_ms"): d["reset_in_ms"] = w.reset_in_ms
-            if hasattr(w, "reset_at"):    d["reset_at"]    = w.reset_at.isoformat() if w.reset_at else None
+            if hasattr(w, "limit"):
+                d["limit"] = w.limit
+            if hasattr(w, "remaining"):
+                d["remaining"] = w.remaining
+            if hasattr(w, "reset_in_ms"):
+                d["reset_in_ms"] = w.reset_in_ms
+            if hasattr(w, "reset_at"):
+                d["reset_at"] = w.reset_at.isoformat() if w.reset_at else None
             if isinstance(w, TokenWindow):
-                d["used"]     = w.used
+                d["used"] = w.used
                 d["pct_used"] = w.pct_used
             return d
 
@@ -125,6 +139,10 @@ class ProviderUsageLive:
             "input_tokens": _window(self.input_tokens),
             "requests":    _window(self.requests),
             "models":      self.models[:20],  # cap for response size
+            "sample_model": self.sample_model,
+            "sample_input_tokens": self.sample_input_tokens,
+            "sample_output_tokens": self.sample_output_tokens,
+            "sample_latency_ms": self.sample_latency_ms,
         }
 
 
@@ -163,6 +181,49 @@ _OAI_DURATION_RE = re.compile(
 )
 
 
+def _apply_anthropic_ratelimit_headers(result: ProviderUsageLive, headers: dict[str, str]) -> None:
+    """Fill ``result``'s tokens, input_tokens and requests windows from Anthropic headers.
+
+    Each window reads the ``anthropic-ratelimit-<kind>-{limit,remaining,reset}`` header
+    triple; value parsing is delegated to ``_parse_int_header`` and ``_parse_iso_reset``.
+    """
+    windows = (
+        ("tokens", "tokens", TokenWindow),
+        ("input_tokens", "input-tokens", TokenWindow),
+        ("requests", "requests", RequestWindow),
+    )
+    for attr, kind, window_cls in windows:
+        prefix = f"anthropic-ratelimit-{kind}"
+        limit = _parse_int_header(headers, f"{prefix}-limit")
+        remaining = _parse_int_header(headers, f"{prefix}-remaining")
+        reset_at = _parse_iso_reset(headers.get(f"{prefix}-reset", ""))
+        setattr(result, attr, window_cls(limit=limit, remaining=remaining, reset_at=reset_at))
+
+
+def _pick_probe_model(models: list[str], priorities: tuple[str, ...]) -> str | None:
+    """Pick by earliest matching priority substring; fall back to models[0]; None if empty."""
+    if not models:
+        return None
+    lowered = [(m, m.lower()) for m in models]
+    for priority in priorities:
+        for original, lowered_name in lowered:
+            if priority in lowered_name:
+                return original
+    return models[0]
+
+
+def _pick_anthropic_probe_model(models: list[str]) -> str | None:
+    """Pick the model for the /v1/messages probe: haiku first, then sonnet, then opus."""
+    priorities = ("haiku", "sonnet", "opus")
+    return _pick_probe_model(models, priorities)
+
+
+def _pick_openai_probe_model(models: list[str]) -> str | None:
+    """Pick the chat-completions probe model by a fixed name-substring preference order."""
+    priorities = ("gpt-4.1-mini", "gpt-4o-mini", "gpt-4.1", "gpt-4o", "o4-mini")
+    return _pick_probe_model(models, priorities)
+
+
 def _parse_openai_reset(value: str) -> datetime | None:
     """Parse an OpenAI reset header: ISO datetime OR duration like '1m30s'."""
     if not value:
@@ -219,23 +280,7 @@ async def _fetch_anthropic(api_key: str, base_url: str | None) -> ProviderUsageL
     result.reachable = True
     result.raw_headers = {k: v for k, v in h.items() if "ratelimit" in k}
 
-    # Token window (combined input+output)
-    result.tokens = TokenWindow(
-        limit     = _parse_int_header(h, "anthropic-ratelimit-tokens-limit"),
-        remaining = _parse_int_header(h, "anthropic-ratelimit-tokens-remaining"),
-        reset_at  = _parse_iso_reset(h.get("anthropic-ratelimit-tokens-reset", "")),
-    )
-    # Input-token window (separate limit for input)
-    result.input_tokens = TokenWindow(
-        limit     = _parse_int_header(h, "anthropic-ratelimit-input-tokens-limit"),
-        remaining = _parse_int_header(h, "anthropic-ratelimit-input-tokens-remaining"),
-        reset_at  = _parse_iso_reset(h.get("anthropic-ratelimit-input-tokens-reset", "")),
-    )
-    result.requests = RequestWindow(
-        limit     = _parse_int_header(h, "anthropic-ratelimit-requests-limit"),
-        remaining = _parse_int_header(h, "anthropic-ratelimit-requests-remaining"),
-        reset_at  = _parse_iso_reset(h.get("anthropic-ratelimit-requests-reset", "")),
-    )
+    _apply_anthropic_ratelimit_headers(result, h)
 
     # Extract model IDs
     try:
@@ -245,6 +290,56 @@ async def _fetch_anthropic(api_key: str, base_url: str | None) -> ProviderUsageL
     except Exception:
         pass
 
+    # Some tiers/paths may omit ratelimit headers on /v1/models.
+    # Fallback to a minimal /v1/messages probe so we can still surface usage/time.
+    if (
+        result.tokens.limit is None
+        and result.input_tokens.limit is None
+        and result.requests.limit is None
+    ):
+        probe_model = _pick_anthropic_probe_model(result.models)
+        if probe_model:
+            result.sample_model = probe_model
+            async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client:
+                try:
+                    probe_resp = await client.post(
+                        f"{base}/v1/messages",
+                        headers={
+                            "x-api-key": api_key,
+                            "anthropic-version": "2023-06-01",
+                            "content-type": "application/json",
+                        },
+                        json={
+                            "model": probe_model,
+                            "max_tokens": 1,
+                            "messages": [{"role": "user", "content": "Usage probe"}],
+                        },
+                    )
+                except Exception:
+                    probe_resp = None
+
+            if probe_resp is not None:
+                probe_headers = {k.lower(): v for k, v in probe_resp.headers.items()}
+                probe_rl_headers = {k: v for k, v in probe_headers.items() if "ratelimit" in k}
+                if probe_rl_headers:
+                    result.raw_headers = probe_rl_headers
+                    _apply_anthropic_ratelimit_headers(result, probe_headers)
+                if probe_resp.status_code == 200:
+                    try:
+                        payload = probe_resp.json()
+                        usage = payload.get("usage") if isinstance(payload, dict) else None
+                        if isinstance(usage, dict):
+                            in_tok = usage.get("input_tokens")
+                            out_tok = usage.get("output_tokens")
+                            if isinstance(in_tok, int):
+                                result.sample_input_tokens = in_tok
+                            if isinstance(out_tok, int):
+                                result.sample_output_tokens = out_tok
+                    except Exception:
+                        pass
+                elapsed_ms = probe_resp.elapsed.total_seconds() * 1000.0
+                result.sample_latency_ms = int(max(0.0, round(elapsed_ms)))
+
     return result
 
 
@@ -296,6 +391,63 @@ async def _fetch_openai(api_key: str, base_url: str | None) -> ProviderUsageLive
     except Exception:
         pass
 
+    if result.tokens.limit is None and result.requests.limit is None:
+        probe_model = _pick_openai_probe_model(result.models)
+        if probe_model:
+            result.sample_model = probe_model
+            async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client:
+                try:
+                    probe_resp = await client.post(
+                        f"{base}/v1/chat/completions",
+                        headers={
+                            "Authorization": f"Bearer {api_key}",
+                            "content-type": "application/json",
+                        },
+                        json={
+                            "model": probe_model,
+                            "messages": [{"role": "user", "content": "Usage probe"}],
+                            "max_tokens": 1,
+                        },
+                    )
+                except Exception:
+                    probe_resp = None
+            if probe_resp is not None:
+                probe_headers = {k.lower(): v for k, v in probe_resp.headers.items()}
+                probe_rl_headers = {k: v for k, v in probe_headers.items() if "ratelimit" in k}
+                if probe_rl_headers:
+                    result.raw_headers = probe_rl_headers
+                    result.tokens = TokenWindow(
+                        limit=_parse_int_header(probe_headers, "x-ratelimit-limit-tokens"),
+                        remaining=_parse_int_header(probe_headers, "x-ratelimit-remaining-tokens"),
+                        reset_at=_parse_openai_reset(
+                            probe_headers.get("x-ratelimit-reset-tokens", "")
+                        ),
+                    )
+                    result.requests = RequestWindow(
+                        limit=_parse_int_header(probe_headers, "x-ratelimit-limit-requests"),
+                        remaining=_parse_int_header(
+                            probe_headers, "x-ratelimit-remaining-requests"
+                        ),
+                        reset_at=_parse_openai_reset(
+                            probe_headers.get("x-ratelimit-reset-requests", "")
+                        ),
+                    )
+                if probe_resp.status_code == 200:
+                    try:
+                        payload = probe_resp.json()
+                        usage = payload.get("usage") if isinstance(payload, dict) else None
+                        if isinstance(usage, dict):
+                            in_tok = usage.get("prompt_tokens")
+                            out_tok = usage.get("completion_tokens")
+                            if isinstance(in_tok, int):
+                                result.sample_input_tokens = in_tok
+                            if isinstance(out_tok, int):
+                                result.sample_output_tokens = out_tok
+                    except Exception:
+                        pass
+                elapsed_ms = probe_resp.elapsed.total_seconds() * 1000.0
+                result.sample_latency_ms = int(max(0.0, round(elapsed_ms)))
+
     return result
 
 
@@ -331,6 +483,37 @@ async def _fetch_ollama(base_url: str | None, api_key: str | None) -> ProviderUs
     except Exception:
         pass
 
+    if result.models:
+        result.sample_model = result.models[0]
+        async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client:
+            try:
+                probe_resp = await client.post(
+                    f"{base}/api/generate",
+                    headers={**headers, "content-type": "application/json"},
+                    json={
+                        "model": result.sample_model,
+                        "prompt": "Usage probe",
+                        "stream": False,
+                        "options": {"num_predict": 1},
+                    },
+                )
+            except Exception:
+                probe_resp = None
+        if probe_resp is not None and probe_resp.status_code == 200:
+            try:
+                payload = probe_resp.json()
+                in_tok = payload.get("prompt_eval_count")
+                out_tok = payload.get("eval_count")
+                total_duration_ns = payload.get("total_duration")
+                if isinstance(in_tok, int):
+                    result.sample_input_tokens = in_tok
+                if isinstance(out_tok, int):
+                    result.sample_output_tokens = out_tok
+                if isinstance(total_duration_ns, int):
+                    result.sample_latency_ms = max(0, int(round(total_duration_ns / 1_000_000)))
+            except Exception:
+                pass
+
     return result
 
 
diff --git a/backend/migrations/versions/c4a1d2e3f4a5_make_main_session_key_nullable.py b/backend/migrations/versions/c4a1d2e3f4a5_make_main_session_key_nullable.py
new file mode 100644
index 0000000..6aba4b3
--- /dev/null
+++ b/backend/migrations/versions/c4a1d2e3f4a5_make_main_session_key_nullable.py
@@ -0,0 +1,34 @@
+"""Make main_session_key nullable on gateways.
+
+The column was NOT NULL but the ORM model didn't include it, causing
+INSERT failures when creating gateways via the API.  The field gets
+populated by ensure_main_agent() after the row exists, so it needs
+to be nullable during the initial INSERT.
+
+Revision ID: c4a1d2e3f4a5
+Revises: f7d8e9a0b1c2
+Create Date: 2026-05-21 04:25:00.000000
+
+"""
+
+from __future__ import annotations
+
+import sqlalchemy as sa
+from alembic import op
+
+
+# revision identifiers, used by Alembic.
+revision = "c4a1d2e3f4a5"
+down_revision = "f7d8e9a0b1c2"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Drop the NOT NULL constraint on gateways.main_session_key so an INSERT may omit it."""
+    op.alter_column("gateways", "main_session_key", nullable=True)
+
+
+def downgrade() -> None:
+    """Revert main_session_key to NOT NULL (expected to fail while NULL rows remain)."""
+    op.alter_column("gateways", "main_session_key", nullable=False)
diff --git a/backend/tests/test_provider_credentials_usage_api.py b/backend/tests/test_provider_credentials_usage_api.py
index 3465784..5754844 100644
--- a/backend/tests/test_provider_credentials_usage_api.py
+++ b/backend/tests/test_provider_credentials_usage_api.py
@@ -89,6 +89,10 @@ async def test_usage_response_includes_rate_limit_header_names(monkeypatch: pyte
             checked_at=utcnow(),
             reachable=True,
         )
+        result.sample_model = "claude-sonnet-4-6"
+        result.sample_input_tokens = 9
+        result.sample_output_tokens = 1
+        result.sample_latency_ms = 123
         result.raw_headers = {
             "anthropic-ratelimit-requests-limit": "1000",
             "anthropic-ratelimit-requests-remaining": "999",
@@ -117,6 +121,10 @@ async def test_usage_response_includes_rate_limit_header_names(monkeypatch: pyte
         data = response.json()
         assert data["provider"] == "anthropic"
         assert data["reachable"] is True
+        assert data["sample_model"] == "claude-sonnet-4-6"
+        assert data["sample_input_tokens"] == 9
+        assert data["sample_output_tokens"] == 1
+        assert data["sample_latency_ms"] == 123
         assert data["debug_rate_limit_headers"] == [
             "anthropic-ratelimit-requests-limit",
             "anthropic-ratelimit-requests-remaining",
@@ -150,6 +158,10 @@ async def test_test_endpoint_returns_live_result(monkeypatch: pytest.MonkeyPatch
             reachable=True,
         )
         result.models = ["claude-sonnet-4-6"]
+        result.sample_model = "claude-sonnet-4-6"
+        result.sample_input_tokens = 8
+        result.sample_output_tokens = 1
+        result.sample_latency_ms = 111
         result.raw_headers = {
             "anthropic-ratelimit-tokens-limit": "100000",
         }
@@ -184,6 +196,10 @@ async def test_test_endpoint_returns_live_result(monkeypatch: pytest.MonkeyPatch
         assert data["account_key"] == "Claude"
         assert data["reachable"] is True
         assert data["models"] == ["claude-sonnet-4-6"]
+        assert data["sample_model"] == "claude-sonnet-4-6"
+        assert data["sample_input_tokens"] == 8
+        assert data["sample_output_tokens"] == 1
+        assert data["sample_latency_ms"] == 111
         assert data["debug_rate_limit_headers"] == ["anthropic-ratelimit-tokens-limit"]
     finally:
         await engine.dispose()
diff --git a/frontend/src/api/generated/model/providerUsageLiveRead.ts b/frontend/src/api/generated/model/providerUsageLiveRead.ts
index 23c2a4c..1bd1967 100644
--- a/frontend/src/api/generated/model/providerUsageLiveRead.ts
+++ b/frontend/src/api/generated/model/providerUsageLiveRead.ts
@@ -20,4 +20,9 @@ export interface ProviderUsageLiveRead {
   input_tokens: TokenWindowRead;
   requests: RequestWindowRead;
   models?: string[];
+  sample_model?: string | null;
+  sample_input_tokens?: number | null;
+  sample_output_tokens?: number | null;
+  sample_latency_ms?: number | null;
+  debug_rate_limit_headers?: string[] | null;
 }
diff --git a/frontend/src/app/settings/ai-providers/page.tsx b/frontend/src/app/settings/ai-providers/page.tsx
index e3d6673..e8874c2 100644
--- a/frontend/src/app/settings/ai-providers/page.tsx
+++ b/frontend/src/app/settings/ai-providers/page.tsx
@@ -236,8 +236,17 @@ function CredentialForm({
               {testResult.reachable ? "Connection successful" : "Connection failed"}
             </p>
             <p className="mt-1 text-muted">
-              {testResult.error ?? `${testResult.models?.length ?? 0} model${(testResult.models?.length ?? 0) === 1 ? "" : "s"} returned`}
+              {testResult.error ?? (
+                testResult.sample_input_tokens != null || testResult.sample_output_tokens != null
+                  ? `Usage probe: in ${fmtTokens(testResult.sample_input_tokens)} · out ${fmtTokens(testResult.sample_output_tokens)}`
+                  : "Connected."
+              )}
             </p>
+            {testResult.sample_latency_ms != null && (
+              <p className="mt-1 text-muted">
+                Probe time: {fmtLatencyMs(testResult.sample_latency_ms)}
+              </p>
+            )}
 
           </div>
         )}
@@ -267,6 +276,12 @@ function fmtResetMs(ms: number | null | undefined): string {
   return `${h}h ${m % 60}m`;
 }
 
+/** Format a latency in ms as "123ms" or "1.23s"; "—" for missing or invalid values. */
+function fmtLatencyMs(ms: number | null | undefined): string {
+  if (ms == null || !Number.isFinite(ms) || ms < 0) return "—";
+  return ms < 1000 ? `${Math.round(ms)}ms` : `${(ms / 1000).toFixed(2)}s`;
+}
+
 function UsageStrip({ credentialId, provider }: { credentialId: string; provider: string }) {
   const [usage, setUsage] = useState<ProviderUsageLiveRead | null>(null);
   const [loading, setLoading] = useState(true);
@@ -318,30 +333,50 @@ function UsageStrip({ credentialId, provider }: { credentialId: string; provider
   const inputTok = usage.input_tokens;
   const req = usage.requests;
   const isOllama = provider === "ollama";
-  const modelCount = usage.models?.length ?? 0;
 
   return (
     <div className="mt-2 rounded-lg border border-[color:var(--border)] bg-[color:var(--surface)] p-2.5">
       {isOllama ? (
-        <div className="flex items-center gap-3 text-xs text-muted">
-          <span className="flex items-center gap-1 text-[color:var(--success)]">
-            <span className="inline-block h-1.5 w-1.5 rounded-full bg-[color:var(--success)]" />
-            Connected
-          </span>
-          {(usage.models?.length ?? 0) > 0 && (
-            <span>{usage.models!.length} model{usage.models!.length !== 1 ? "s" : ""} available</span>
+        <div className="space-y-1.5">
+          <div className="flex items-center gap-3 text-xs text-muted">
+            <span className="flex items-center gap-1 text-[color:var(--success)]">
+              <span className="inline-block h-1.5 w-1.5 rounded-full bg-[color:var(--success)]" />
+              Connected
+            </span>
+            {(usage.models?.length ?? 0) > 0 && (
+              <span>{usage.models!.length} model{usage.models!.length !== 1 ? "s" : ""} available</span>
+            )}
+            <button type="button" onClick={() => fetch(true)} className="ml-auto text-muted hover:text-strong">
+              <RefreshCw className={`h-3 w-3 ${loading ? "animate-spin" : ""}`} />
+            </button>
+          </div>
+          {(usage.sample_input_tokens != null || usage.sample_output_tokens != null) && (
+            <div className="flex items-center justify-between text-[11px] text-muted">
+              <span>Usage (last probe)</span>
+              <span className="tabular-nums text-strong">
+                in {fmtTokens(usage.sample_input_tokens)} · out {fmtTokens(usage.sample_output_tokens)}
+              </span>
+            </div>
           )}
-          <button type="button" onClick={() => fetch(true)} className="ml-auto text-muted hover:text-strong">
-            <RefreshCw className={`h-3 w-3 ${loading ? "animate-spin" : ""}`} />
-          </button>
+          {usage.sample_latency_ms != null && (
+            <div className="flex items-center justify-between text-[11px] text-muted">
+              <span>Time (last probe)</span>
+              <span className="tabular-nums text-strong">
+                {fmtLatencyMs(usage.sample_latency_ms)}
+              </span>
+            </div>
+          )}
+          <div className="flex items-center justify-between text-[11px] text-muted">
+            {lastFetched && <span>Updated {Math.round((Date.now() - lastFetched.getTime()) / 1000)}s ago</span>}
+          </div>
         </div>
       ) : (
         <div className="space-y-1.5">
-          {modelCount > 0 && (
+          {(usage.sample_input_tokens != null || usage.sample_output_tokens != null) && (
             <div className="flex items-center justify-between text-[11px] text-muted">
-              <span>Models</span>
+              <span>Usage (last probe)</span>
               <span className="tabular-nums text-strong">
-                {modelCount} available
+                in {fmtTokens(usage.sample_input_tokens)} · out {fmtTokens(usage.sample_output_tokens)}
               </span>
             </div>
           )}
@@ -415,7 +450,7 @@ function UsageStrip({ credentialId, provider }: { credentialId: string; provider
 
           {tok.limit == null && inputTok.limit == null && req.limit == null && (
             <p className="text-[11px] text-muted">
-              Connected — provider did not return token/request limit headers for this key tier.
+              Connected — no token/request limit windows were returned for this key right now.
             </p>
           )}