fix(scripts): codex/ollama fallback

This commit is contained in:
null 2026-05-20 23:49:56 -05:00
parent 9650283367
commit fa5a18bccd
2 changed files with 172 additions and 59 deletions

View File

@ -1,7 +1,8 @@
"""Live provider usage — fetch real token limits and reset times directly from provider APIs.
"""Live provider usage — fetch token limits and reset times from provider APIs.
Each provider exposes rate-limit headers on every API response. Pipeline calls
a lightweight, zero-token-cost endpoint (model list) and reads those headers.
Some providers expose rate-limit headers on regular API responses. Pipeline first
calls a lightweight endpoint (model list) and then, when needed, performs a
minimal generation probe to surface usage and latency details.
No guessing, no JSONL scanning, no estimates. If the provider API is
unreachable or the key is invalid, an error is returned and all limit fields
@ -21,7 +22,8 @@ openai → GET https://api.openai.com/v1/models
x-ratelimit-reset-tokens, x-ratelimit-limit-requests,
x-ratelimit-remaining-requests, x-ratelimit-reset-requests
Fallback probe (only when headers missing):
POST /v1/chat/completions with max_tokens=1 to surface usage+time.
POST /v1/responses with max_output_tokens=1 (preferred),
then /v1/chat/completions with max_tokens=1 (compatibility).
ollama GET {base_url}/api/tags (health-check only; no rate limits)
Returns: model list, server reachable flag
@ -215,7 +217,20 @@ def _pick_anthropic_probe_model(models: list[str]) -> str | None:
def _pick_openai_probe_model(models: list[str]) -> str | None:
if not models:
return None
priorities = ("gpt-4.1-mini", "gpt-4o-mini", "gpt-4.1", "gpt-4o", "o4-mini")
priorities = (
"gpt-5.5",
"gpt-5.4",
"gpt-5.3-codex",
"gpt-5.2-codex",
"gpt-5.1-codex",
"gpt-5-codex",
"codex",
"gpt-4.1-mini",
"gpt-4o-mini",
"gpt-4.1",
"gpt-4o",
"o4-mini",
)
lowered = [(m, m.lower()) for m in models]
for priority in priorities:
for original, lowered_name in lowered:
@ -224,6 +239,16 @@ def _pick_openai_probe_model(models: list[str]) -> str | None:
return models[0]
def _normalize_base(base_url: str | None, default_base: str, *, strip_suffixes: tuple[str, ...]) -> str:
base = (base_url or default_base).strip().rstrip("/")
lowered = base.lower()
for suffix in strip_suffixes:
if lowered.endswith(suffix):
base = base[: -len(suffix)]
break
return base.rstrip("/")
def _parse_openai_reset(value: str) -> datetime | None:
"""Parse an OpenAI reset header: ISO datetime OR duration like '1m30s'."""
if not value:
@ -242,12 +267,54 @@ def _parse_openai_reset(value: str) -> datetime | None:
return None
def _apply_openai_ratelimit_headers(result: ProviderUsageLive, headers: dict[str, str]) -> None:
result.tokens = TokenWindow(
limit=_parse_int_header(headers, "x-ratelimit-limit-tokens"),
remaining=_parse_int_header(headers, "x-ratelimit-remaining-tokens"),
reset_at=_parse_openai_reset(headers.get("x-ratelimit-reset-tokens", "")),
)
result.requests = RequestWindow(
limit=_parse_int_header(headers, "x-ratelimit-limit-requests"),
remaining=_parse_int_header(headers, "x-ratelimit-remaining-requests"),
reset_at=_parse_openai_reset(headers.get("x-ratelimit-reset-requests", "")),
)
def _extract_openai_usage(payload: Any) -> tuple[int | None, int | None]:
if not isinstance(payload, dict):
return (None, None)
usage = payload.get("usage")
if not isinstance(usage, dict):
return (None, None)
# Responses API style
in_tok = usage.get("input_tokens")
out_tok = usage.get("output_tokens")
if isinstance(in_tok, int) or isinstance(out_tok, int):
return (
in_tok if isinstance(in_tok, int) else None,
out_tok if isinstance(out_tok, int) else None,
)
# Chat Completions style
in_tok = usage.get("prompt_tokens")
out_tok = usage.get("completion_tokens")
return (
in_tok if isinstance(in_tok, int) else None,
out_tok if isinstance(out_tok, int) else None,
)
# ---------------------------------------------------------------------------
# Provider-specific fetch functions
# ---------------------------------------------------------------------------
async def _fetch_anthropic(api_key: str, base_url: str | None) -> ProviderUsageLive:
base = (base_url or "https://api.anthropic.com").rstrip("/")
base = _normalize_base(
base_url,
"https://api.anthropic.com",
strip_suffixes=("/v1",),
)
now = utcnow()
result = ProviderUsageLive(provider="anthropic", account_key="", checked_at=now, reachable=False)
@ -344,7 +411,11 @@ async def _fetch_anthropic(api_key: str, base_url: str | None) -> ProviderUsageL
async def _fetch_openai(api_key: str, base_url: str | None) -> ProviderUsageLive:
base = (base_url or "https://api.openai.com").rstrip("/")
base = _normalize_base(
base_url,
"https://api.openai.com",
strip_suffixes=("/v1",),
)
now = utcnow()
result = ProviderUsageLive(provider="openai", account_key="", checked_at=now, reachable=False)
@ -373,16 +444,7 @@ async def _fetch_openai(api_key: str, base_url: str | None) -> ProviderUsageLive
result.reachable = True
result.raw_headers = {k: v for k, v in h.items() if "ratelimit" in k}
result.tokens = TokenWindow(
limit = _parse_int_header(h, "x-ratelimit-limit-tokens"),
remaining = _parse_int_header(h, "x-ratelimit-remaining-tokens"),
reset_at = _parse_openai_reset(h.get("x-ratelimit-reset-tokens", "")),
)
result.requests = RequestWindow(
limit = _parse_int_header(h, "x-ratelimit-limit-requests"),
remaining = _parse_int_header(h, "x-ratelimit-remaining-requests"),
reset_at = _parse_openai_reset(h.get("x-ratelimit-reset-requests", "")),
)
_apply_openai_ratelimit_headers(result, h)
try:
data = resp.json()
@ -395,64 +457,71 @@ async def _fetch_openai(api_key: str, base_url: str | None) -> ProviderUsageLive
probe_model = _pick_openai_probe_model(result.models)
if probe_model:
result.sample_model = probe_model
async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client:
try:
probe_resp = await client.post(
f"{base}/v1/chat/completions",
headers={
"Authorization": f"Bearer {api_key}",
"content-type": "application/json",
},
json={
"model": probe_model,
"messages": [{"role": "user", "content": "Usage probe"}],
"max_tokens": 1,
},
)
except Exception:
probe_resp = None
if probe_resp is not None:
probe_endpoints: list[tuple[str, dict[str, Any]]] = [
(
f"{base}/v1/responses",
{
"model": probe_model,
"input": "Usage probe",
"max_output_tokens": 1,
},
),
(
f"{base}/v1/chat/completions",
{
"model": probe_model,
"messages": [{"role": "user", "content": "Usage probe"}],
"max_tokens": 1,
},
),
]
for endpoint, body in probe_endpoints:
async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client:
try:
probe_resp = await client.post(
endpoint,
headers={
"Authorization": f"Bearer {api_key}",
"content-type": "application/json",
},
json=body,
)
except Exception:
continue
probe_headers = {k.lower(): v for k, v in probe_resp.headers.items()}
probe_rl_headers = {k: v for k, v in probe_headers.items() if "ratelimit" in k}
if probe_rl_headers:
result.raw_headers = probe_rl_headers
result.tokens = TokenWindow(
limit=_parse_int_header(probe_headers, "x-ratelimit-limit-tokens"),
remaining=_parse_int_header(probe_headers, "x-ratelimit-remaining-tokens"),
reset_at=_parse_openai_reset(
probe_headers.get("x-ratelimit-reset-tokens", "")
),
)
result.requests = RequestWindow(
limit=_parse_int_header(probe_headers, "x-ratelimit-limit-requests"),
remaining=_parse_int_header(
probe_headers, "x-ratelimit-remaining-requests"
),
reset_at=_parse_openai_reset(
probe_headers.get("x-ratelimit-reset-requests", "")
),
)
_apply_openai_ratelimit_headers(result, probe_headers)
if probe_resp.status_code == 200:
try:
payload = probe_resp.json()
usage = payload.get("usage") if isinstance(payload, dict) else None
if isinstance(usage, dict):
in_tok = usage.get("prompt_tokens")
out_tok = usage.get("completion_tokens")
if isinstance(in_tok, int):
result.sample_input_tokens = in_tok
if isinstance(out_tok, int):
result.sample_output_tokens = out_tok
in_tok, out_tok = _extract_openai_usage(payload)
if in_tok is not None:
result.sample_input_tokens = in_tok
if out_tok is not None:
result.sample_output_tokens = out_tok
except Exception:
pass
elapsed_ms = probe_resp.elapsed.total_seconds() * 1000.0
result.sample_latency_ms = int(max(0.0, round(elapsed_ms)))
if (
result.tokens.limit is not None
or result.requests.limit is not None
or result.sample_input_tokens is not None
or result.sample_output_tokens is not None
):
break
return result
async def _fetch_ollama(base_url: str | None, api_key: str | None) -> ProviderUsageLive:
base = (base_url or "http://localhost:11434").rstrip("/")
base = _normalize_base(
base_url,
"http://localhost:11434",
strip_suffixes=("/api",),
)
now = utcnow()
result = ProviderUsageLive(provider="ollama", account_key="", checked_at=now, reachable=False)

View File

@ -0,0 +1,44 @@
# ruff: noqa: INP001
"""Unit tests for provider usage parsing and normalization helpers."""
from __future__ import annotations
from app.services.provider_usage import (
_extract_openai_usage,
_normalize_base,
_parse_openai_reset,
)
def test_normalize_base_strips_known_suffixes() -> None:
assert (
_normalize_base(
"https://api.openai.com/v1/",
"https://api.openai.com",
strip_suffixes=("/v1",),
)
== "https://api.openai.com"
)
assert (
_normalize_base(
"https://ollama.com/api/",
"http://localhost:11434",
strip_suffixes=("/api",),
)
== "https://ollama.com"
)
def test_extract_openai_usage_supports_responses_shape() -> None:
payload = {"usage": {"input_tokens": 12, "output_tokens": 3}}
assert _extract_openai_usage(payload) == (12, 3)
def test_extract_openai_usage_supports_chat_completions_shape() -> None:
payload = {"usage": {"prompt_tokens": 9, "completion_tokens": 1}}
assert _extract_openai_usage(payload) == (9, 1)
def test_parse_openai_reset_duration_format() -> None:
reset_at = _parse_openai_reset("6m0s")
assert reset_at is not None