fix(scripts): codex/ollama fallback

This commit is contained in:
null 2026-05-20 23:49:56 -05:00
parent 9650283367
commit fa5a18bccd
2 changed files with 172 additions and 59 deletions

View File

@ -1,7 +1,8 @@
"""Live provider usage — fetch real token limits and reset times directly from provider APIs. """Live provider usage — fetch token limits and reset times from provider APIs.
Each provider exposes rate-limit headers on every API response. Pipeline calls Some providers expose rate-limit headers on regular API responses. Pipeline first
a lightweight, zero-token-cost endpoint (model list) and reads those headers. calls a lightweight endpoint (model list) and then, when needed, performs a
minimal generation probe to surface usage and latency details.
No guessing, no JSONL scanning, no estimates. If the provider API is No guessing, no JSONL scanning, no estimates. If the provider API is
unreachable or the key is invalid, an error is returned and all limit fields unreachable or the key is invalid, an error is returned and all limit fields
@ -21,7 +22,8 @@ openai → GET https://api.openai.com/v1/models
x-ratelimit-reset-tokens, x-ratelimit-limit-requests, x-ratelimit-reset-tokens, x-ratelimit-limit-requests,
x-ratelimit-remaining-requests, x-ratelimit-reset-requests x-ratelimit-remaining-requests, x-ratelimit-reset-requests
Fallback probe (only when headers missing): Fallback probe (only when headers missing):
POST /v1/chat/completions with max_tokens=1 to surface usage+time. POST /v1/responses with max_output_tokens=1 (preferred),
then /v1/chat/completions with max_tokens=1 (compatibility).
ollama GET {base_url}/api/tags (health-check only; no rate limits) ollama GET {base_url}/api/tags (health-check only; no rate limits)
Returns: model list, server reachable flag Returns: model list, server reachable flag
@ -215,7 +217,20 @@ def _pick_anthropic_probe_model(models: list[str]) -> str | None:
def _pick_openai_probe_model(models: list[str]) -> str | None: def _pick_openai_probe_model(models: list[str]) -> str | None:
if not models: if not models:
return None return None
priorities = ("gpt-4.1-mini", "gpt-4o-mini", "gpt-4.1", "gpt-4o", "o4-mini") priorities = (
"gpt-5.5",
"gpt-5.4",
"gpt-5.3-codex",
"gpt-5.2-codex",
"gpt-5.1-codex",
"gpt-5-codex",
"codex",
"gpt-4.1-mini",
"gpt-4o-mini",
"gpt-4.1",
"gpt-4o",
"o4-mini",
)
lowered = [(m, m.lower()) for m in models] lowered = [(m, m.lower()) for m in models]
for priority in priorities: for priority in priorities:
for original, lowered_name in lowered: for original, lowered_name in lowered:
@ -224,6 +239,16 @@ def _pick_openai_probe_model(models: list[str]) -> str | None:
return models[0] return models[0]
def _normalize_base(base_url: str | None, default_base: str, *, strip_suffixes: tuple[str, ...]) -> str:
base = (base_url or default_base).strip().rstrip("/")
lowered = base.lower()
for suffix in strip_suffixes:
if lowered.endswith(suffix):
base = base[: -len(suffix)]
break
return base.rstrip("/")
def _parse_openai_reset(value: str) -> datetime | None: def _parse_openai_reset(value: str) -> datetime | None:
"""Parse an OpenAI reset header: ISO datetime OR duration like '1m30s'.""" """Parse an OpenAI reset header: ISO datetime OR duration like '1m30s'."""
if not value: if not value:
@ -242,12 +267,54 @@ def _parse_openai_reset(value: str) -> datetime | None:
return None return None
def _apply_openai_ratelimit_headers(result: ProviderUsageLive, headers: dict[str, str]) -> None:
result.tokens = TokenWindow(
limit=_parse_int_header(headers, "x-ratelimit-limit-tokens"),
remaining=_parse_int_header(headers, "x-ratelimit-remaining-tokens"),
reset_at=_parse_openai_reset(headers.get("x-ratelimit-reset-tokens", "")),
)
result.requests = RequestWindow(
limit=_parse_int_header(headers, "x-ratelimit-limit-requests"),
remaining=_parse_int_header(headers, "x-ratelimit-remaining-requests"),
reset_at=_parse_openai_reset(headers.get("x-ratelimit-reset-requests", "")),
)
def _extract_openai_usage(payload: Any) -> tuple[int | None, int | None]:
if not isinstance(payload, dict):
return (None, None)
usage = payload.get("usage")
if not isinstance(usage, dict):
return (None, None)
# Responses API style
in_tok = usage.get("input_tokens")
out_tok = usage.get("output_tokens")
if isinstance(in_tok, int) or isinstance(out_tok, int):
return (
in_tok if isinstance(in_tok, int) else None,
out_tok if isinstance(out_tok, int) else None,
)
# Chat Completions style
in_tok = usage.get("prompt_tokens")
out_tok = usage.get("completion_tokens")
return (
in_tok if isinstance(in_tok, int) else None,
out_tok if isinstance(out_tok, int) else None,
)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Provider-specific fetch functions # Provider-specific fetch functions
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
async def _fetch_anthropic(api_key: str, base_url: str | None) -> ProviderUsageLive: async def _fetch_anthropic(api_key: str, base_url: str | None) -> ProviderUsageLive:
base = (base_url or "https://api.anthropic.com").rstrip("/") base = _normalize_base(
base_url,
"https://api.anthropic.com",
strip_suffixes=("/v1",),
)
now = utcnow() now = utcnow()
result = ProviderUsageLive(provider="anthropic", account_key="", checked_at=now, reachable=False) result = ProviderUsageLive(provider="anthropic", account_key="", checked_at=now, reachable=False)
@ -344,7 +411,11 @@ async def _fetch_anthropic(api_key: str, base_url: str | None) -> ProviderUsageL
async def _fetch_openai(api_key: str, base_url: str | None) -> ProviderUsageLive: async def _fetch_openai(api_key: str, base_url: str | None) -> ProviderUsageLive:
base = (base_url or "https://api.openai.com").rstrip("/") base = _normalize_base(
base_url,
"https://api.openai.com",
strip_suffixes=("/v1",),
)
now = utcnow() now = utcnow()
result = ProviderUsageLive(provider="openai", account_key="", checked_at=now, reachable=False) result = ProviderUsageLive(provider="openai", account_key="", checked_at=now, reachable=False)
@ -373,16 +444,7 @@ async def _fetch_openai(api_key: str, base_url: str | None) -> ProviderUsageLive
result.reachable = True result.reachable = True
result.raw_headers = {k: v for k, v in h.items() if "ratelimit" in k} result.raw_headers = {k: v for k, v in h.items() if "ratelimit" in k}
result.tokens = TokenWindow( _apply_openai_ratelimit_headers(result, h)
limit = _parse_int_header(h, "x-ratelimit-limit-tokens"),
remaining = _parse_int_header(h, "x-ratelimit-remaining-tokens"),
reset_at = _parse_openai_reset(h.get("x-ratelimit-reset-tokens", "")),
)
result.requests = RequestWindow(
limit = _parse_int_header(h, "x-ratelimit-limit-requests"),
remaining = _parse_int_header(h, "x-ratelimit-remaining-requests"),
reset_at = _parse_openai_reset(h.get("x-ratelimit-reset-requests", "")),
)
try: try:
data = resp.json() data = resp.json()
@ -395,64 +457,71 @@ async def _fetch_openai(api_key: str, base_url: str | None) -> ProviderUsageLive
probe_model = _pick_openai_probe_model(result.models) probe_model = _pick_openai_probe_model(result.models)
if probe_model: if probe_model:
result.sample_model = probe_model result.sample_model = probe_model
async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client: probe_endpoints: list[tuple[str, dict[str, Any]]] = [
try: (
probe_resp = await client.post( f"{base}/v1/responses",
f"{base}/v1/chat/completions", {
headers={ "model": probe_model,
"Authorization": f"Bearer {api_key}", "input": "Usage probe",
"content-type": "application/json", "max_output_tokens": 1,
}, },
json={ ),
"model": probe_model, (
"messages": [{"role": "user", "content": "Usage probe"}], f"{base}/v1/chat/completions",
"max_tokens": 1, {
}, "model": probe_model,
) "messages": [{"role": "user", "content": "Usage probe"}],
except Exception: "max_tokens": 1,
probe_resp = None },
if probe_resp is not None: ),
]
for endpoint, body in probe_endpoints:
async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client:
try:
probe_resp = await client.post(
endpoint,
headers={
"Authorization": f"Bearer {api_key}",
"content-type": "application/json",
},
json=body,
)
except Exception:
continue
probe_headers = {k.lower(): v for k, v in probe_resp.headers.items()} probe_headers = {k.lower(): v for k, v in probe_resp.headers.items()}
probe_rl_headers = {k: v for k, v in probe_headers.items() if "ratelimit" in k} probe_rl_headers = {k: v for k, v in probe_headers.items() if "ratelimit" in k}
if probe_rl_headers: if probe_rl_headers:
result.raw_headers = probe_rl_headers result.raw_headers = probe_rl_headers
result.tokens = TokenWindow( _apply_openai_ratelimit_headers(result, probe_headers)
limit=_parse_int_header(probe_headers, "x-ratelimit-limit-tokens"),
remaining=_parse_int_header(probe_headers, "x-ratelimit-remaining-tokens"),
reset_at=_parse_openai_reset(
probe_headers.get("x-ratelimit-reset-tokens", "")
),
)
result.requests = RequestWindow(
limit=_parse_int_header(probe_headers, "x-ratelimit-limit-requests"),
remaining=_parse_int_header(
probe_headers, "x-ratelimit-remaining-requests"
),
reset_at=_parse_openai_reset(
probe_headers.get("x-ratelimit-reset-requests", "")
),
)
if probe_resp.status_code == 200: if probe_resp.status_code == 200:
try: try:
payload = probe_resp.json() payload = probe_resp.json()
usage = payload.get("usage") if isinstance(payload, dict) else None in_tok, out_tok = _extract_openai_usage(payload)
if isinstance(usage, dict): if in_tok is not None:
in_tok = usage.get("prompt_tokens") result.sample_input_tokens = in_tok
out_tok = usage.get("completion_tokens") if out_tok is not None:
if isinstance(in_tok, int): result.sample_output_tokens = out_tok
result.sample_input_tokens = in_tok
if isinstance(out_tok, int):
result.sample_output_tokens = out_tok
except Exception: except Exception:
pass pass
elapsed_ms = probe_resp.elapsed.total_seconds() * 1000.0 elapsed_ms = probe_resp.elapsed.total_seconds() * 1000.0
result.sample_latency_ms = int(max(0.0, round(elapsed_ms))) result.sample_latency_ms = int(max(0.0, round(elapsed_ms)))
if (
result.tokens.limit is not None
or result.requests.limit is not None
or result.sample_input_tokens is not None
or result.sample_output_tokens is not None
):
break
return result return result
async def _fetch_ollama(base_url: str | None, api_key: str | None) -> ProviderUsageLive: async def _fetch_ollama(base_url: str | None, api_key: str | None) -> ProviderUsageLive:
base = (base_url or "http://localhost:11434").rstrip("/") base = _normalize_base(
base_url,
"http://localhost:11434",
strip_suffixes=("/api",),
)
now = utcnow() now = utcnow()
result = ProviderUsageLive(provider="ollama", account_key="", checked_at=now, reachable=False) result = ProviderUsageLive(provider="ollama", account_key="", checked_at=now, reachable=False)

View File

@ -0,0 +1,44 @@
# ruff: noqa: INP001
"""Unit tests for provider usage parsing and normalization helpers."""
from __future__ import annotations
from app.services.provider_usage import (
_extract_openai_usage,
_normalize_base,
_parse_openai_reset,
)
def test_normalize_base_strips_known_suffixes() -> None:
assert (
_normalize_base(
"https://api.openai.com/v1/",
"https://api.openai.com",
strip_suffixes=("/v1",),
)
== "https://api.openai.com"
)
assert (
_normalize_base(
"https://ollama.com/api/",
"http://localhost:11434",
strip_suffixes=("/api",),
)
== "https://ollama.com"
)
def test_extract_openai_usage_supports_responses_shape() -> None:
payload = {"usage": {"input_tokens": 12, "output_tokens": 3}}
assert _extract_openai_usage(payload) == (12, 3)
def test_extract_openai_usage_supports_chat_completions_shape() -> None:
payload = {"usage": {"prompt_tokens": 9, "completion_tokens": 1}}
assert _extract_openai_usage(payload) == (9, 1)
def test_parse_openai_reset_duration_format() -> None:
reset_at = _parse_openai_reset("6m0s")
assert reset_at is not None