fix(scripts): codex/ollama fallback
This commit is contained in:
parent
9650283367
commit
fa5a18bccd
|
|
@ -1,7 +1,8 @@
|
||||||
"""Live provider usage — fetch real token limits and reset times directly from provider APIs.
|
"""Live provider usage — fetch token limits and reset times from provider APIs.
|
||||||
|
|
||||||
Each provider exposes rate-limit headers on every API response. Pipeline calls
|
Some providers expose rate-limit headers on regular API responses. Pipeline first
|
||||||
a lightweight, zero-token-cost endpoint (model list) and reads those headers.
|
calls a lightweight endpoint (model list) and then, when needed, performs a
|
||||||
|
minimal generation probe to surface usage and latency details.
|
||||||
|
|
||||||
No guessing, no JSONL scanning, no estimates. If the provider API is
|
No guessing, no JSONL scanning, no estimates. If the provider API is
|
||||||
unreachable or the key is invalid, an error is returned and all limit fields
|
unreachable or the key is invalid, an error is returned and all limit fields
|
||||||
|
|
@ -21,7 +22,8 @@ openai → GET https://api.openai.com/v1/models
|
||||||
x-ratelimit-reset-tokens, x-ratelimit-limit-requests,
|
x-ratelimit-reset-tokens, x-ratelimit-limit-requests,
|
||||||
x-ratelimit-remaining-requests, x-ratelimit-reset-requests
|
x-ratelimit-remaining-requests, x-ratelimit-reset-requests
|
||||||
Fallback probe (only when headers missing):
|
Fallback probe (only when headers missing):
|
||||||
POST /v1/chat/completions with max_tokens=1 to surface usage+time.
|
POST /v1/responses with max_output_tokens=1 (preferred),
|
||||||
|
then /v1/chat/completions with max_tokens=1 (compatibility).
|
||||||
|
|
||||||
ollama → GET {base_url}/api/tags (health-check only; no rate limits)
|
ollama → GET {base_url}/api/tags (health-check only; no rate limits)
|
||||||
Returns: model list, server reachable flag
|
Returns: model list, server reachable flag
|
||||||
|
|
@ -215,7 +217,20 @@ def _pick_anthropic_probe_model(models: list[str]) -> str | None:
|
||||||
def _pick_openai_probe_model(models: list[str]) -> str | None:
|
def _pick_openai_probe_model(models: list[str]) -> str | None:
|
||||||
if not models:
|
if not models:
|
||||||
return None
|
return None
|
||||||
priorities = ("gpt-4.1-mini", "gpt-4o-mini", "gpt-4.1", "gpt-4o", "o4-mini")
|
priorities = (
|
||||||
|
"gpt-5.5",
|
||||||
|
"gpt-5.4",
|
||||||
|
"gpt-5.3-codex",
|
||||||
|
"gpt-5.2-codex",
|
||||||
|
"gpt-5.1-codex",
|
||||||
|
"gpt-5-codex",
|
||||||
|
"codex",
|
||||||
|
"gpt-4.1-mini",
|
||||||
|
"gpt-4o-mini",
|
||||||
|
"gpt-4.1",
|
||||||
|
"gpt-4o",
|
||||||
|
"o4-mini",
|
||||||
|
)
|
||||||
lowered = [(m, m.lower()) for m in models]
|
lowered = [(m, m.lower()) for m in models]
|
||||||
for priority in priorities:
|
for priority in priorities:
|
||||||
for original, lowered_name in lowered:
|
for original, lowered_name in lowered:
|
||||||
|
|
@ -224,6 +239,16 @@ def _pick_openai_probe_model(models: list[str]) -> str | None:
|
||||||
return models[0]
|
return models[0]
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_base(base_url: str | None, default_base: str, *, strip_suffixes: tuple[str, ...]) -> str:
|
||||||
|
base = (base_url or default_base).strip().rstrip("/")
|
||||||
|
lowered = base.lower()
|
||||||
|
for suffix in strip_suffixes:
|
||||||
|
if lowered.endswith(suffix):
|
||||||
|
base = base[: -len(suffix)]
|
||||||
|
break
|
||||||
|
return base.rstrip("/")
|
||||||
|
|
||||||
|
|
||||||
def _parse_openai_reset(value: str) -> datetime | None:
|
def _parse_openai_reset(value: str) -> datetime | None:
|
||||||
"""Parse an OpenAI reset header: ISO datetime OR duration like '1m30s'."""
|
"""Parse an OpenAI reset header: ISO datetime OR duration like '1m30s'."""
|
||||||
if not value:
|
if not value:
|
||||||
|
|
@ -242,12 +267,54 @@ def _parse_openai_reset(value: str) -> datetime | None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _apply_openai_ratelimit_headers(result: ProviderUsageLive, headers: dict[str, str]) -> None:
|
||||||
|
result.tokens = TokenWindow(
|
||||||
|
limit=_parse_int_header(headers, "x-ratelimit-limit-tokens"),
|
||||||
|
remaining=_parse_int_header(headers, "x-ratelimit-remaining-tokens"),
|
||||||
|
reset_at=_parse_openai_reset(headers.get("x-ratelimit-reset-tokens", "")),
|
||||||
|
)
|
||||||
|
result.requests = RequestWindow(
|
||||||
|
limit=_parse_int_header(headers, "x-ratelimit-limit-requests"),
|
||||||
|
remaining=_parse_int_header(headers, "x-ratelimit-remaining-requests"),
|
||||||
|
reset_at=_parse_openai_reset(headers.get("x-ratelimit-reset-requests", "")),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_openai_usage(payload: Any) -> tuple[int | None, int | None]:
|
||||||
|
if not isinstance(payload, dict):
|
||||||
|
return (None, None)
|
||||||
|
usage = payload.get("usage")
|
||||||
|
if not isinstance(usage, dict):
|
||||||
|
return (None, None)
|
||||||
|
|
||||||
|
# Responses API style
|
||||||
|
in_tok = usage.get("input_tokens")
|
||||||
|
out_tok = usage.get("output_tokens")
|
||||||
|
if isinstance(in_tok, int) or isinstance(out_tok, int):
|
||||||
|
return (
|
||||||
|
in_tok if isinstance(in_tok, int) else None,
|
||||||
|
out_tok if isinstance(out_tok, int) else None,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Chat Completions style
|
||||||
|
in_tok = usage.get("prompt_tokens")
|
||||||
|
out_tok = usage.get("completion_tokens")
|
||||||
|
return (
|
||||||
|
in_tok if isinstance(in_tok, int) else None,
|
||||||
|
out_tok if isinstance(out_tok, int) else None,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Provider-specific fetch functions
|
# Provider-specific fetch functions
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
async def _fetch_anthropic(api_key: str, base_url: str | None) -> ProviderUsageLive:
|
async def _fetch_anthropic(api_key: str, base_url: str | None) -> ProviderUsageLive:
|
||||||
base = (base_url or "https://api.anthropic.com").rstrip("/")
|
base = _normalize_base(
|
||||||
|
base_url,
|
||||||
|
"https://api.anthropic.com",
|
||||||
|
strip_suffixes=("/v1",),
|
||||||
|
)
|
||||||
now = utcnow()
|
now = utcnow()
|
||||||
result = ProviderUsageLive(provider="anthropic", account_key="", checked_at=now, reachable=False)
|
result = ProviderUsageLive(provider="anthropic", account_key="", checked_at=now, reachable=False)
|
||||||
|
|
||||||
|
|
@ -344,7 +411,11 @@ async def _fetch_anthropic(api_key: str, base_url: str | None) -> ProviderUsageL
|
||||||
|
|
||||||
|
|
||||||
async def _fetch_openai(api_key: str, base_url: str | None) -> ProviderUsageLive:
|
async def _fetch_openai(api_key: str, base_url: str | None) -> ProviderUsageLive:
|
||||||
base = (base_url or "https://api.openai.com").rstrip("/")
|
base = _normalize_base(
|
||||||
|
base_url,
|
||||||
|
"https://api.openai.com",
|
||||||
|
strip_suffixes=("/v1",),
|
||||||
|
)
|
||||||
now = utcnow()
|
now = utcnow()
|
||||||
result = ProviderUsageLive(provider="openai", account_key="", checked_at=now, reachable=False)
|
result = ProviderUsageLive(provider="openai", account_key="", checked_at=now, reachable=False)
|
||||||
|
|
||||||
|
|
@ -373,16 +444,7 @@ async def _fetch_openai(api_key: str, base_url: str | None) -> ProviderUsageLive
|
||||||
result.reachable = True
|
result.reachable = True
|
||||||
result.raw_headers = {k: v for k, v in h.items() if "ratelimit" in k}
|
result.raw_headers = {k: v for k, v in h.items() if "ratelimit" in k}
|
||||||
|
|
||||||
result.tokens = TokenWindow(
|
_apply_openai_ratelimit_headers(result, h)
|
||||||
limit = _parse_int_header(h, "x-ratelimit-limit-tokens"),
|
|
||||||
remaining = _parse_int_header(h, "x-ratelimit-remaining-tokens"),
|
|
||||||
reset_at = _parse_openai_reset(h.get("x-ratelimit-reset-tokens", "")),
|
|
||||||
)
|
|
||||||
result.requests = RequestWindow(
|
|
||||||
limit = _parse_int_header(h, "x-ratelimit-limit-requests"),
|
|
||||||
remaining = _parse_int_header(h, "x-ratelimit-remaining-requests"),
|
|
||||||
reset_at = _parse_openai_reset(h.get("x-ratelimit-reset-requests", "")),
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
data = resp.json()
|
data = resp.json()
|
||||||
|
|
@ -395,64 +457,71 @@ async def _fetch_openai(api_key: str, base_url: str | None) -> ProviderUsageLive
|
||||||
probe_model = _pick_openai_probe_model(result.models)
|
probe_model = _pick_openai_probe_model(result.models)
|
||||||
if probe_model:
|
if probe_model:
|
||||||
result.sample_model = probe_model
|
result.sample_model = probe_model
|
||||||
async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client:
|
probe_endpoints: list[tuple[str, dict[str, Any]]] = [
|
||||||
try:
|
(
|
||||||
probe_resp = await client.post(
|
f"{base}/v1/responses",
|
||||||
f"{base}/v1/chat/completions",
|
{
|
||||||
headers={
|
"model": probe_model,
|
||||||
"Authorization": f"Bearer {api_key}",
|
"input": "Usage probe",
|
||||||
"content-type": "application/json",
|
"max_output_tokens": 1,
|
||||||
},
|
},
|
||||||
json={
|
),
|
||||||
"model": probe_model,
|
(
|
||||||
"messages": [{"role": "user", "content": "Usage probe"}],
|
f"{base}/v1/chat/completions",
|
||||||
"max_tokens": 1,
|
{
|
||||||
},
|
"model": probe_model,
|
||||||
)
|
"messages": [{"role": "user", "content": "Usage probe"}],
|
||||||
except Exception:
|
"max_tokens": 1,
|
||||||
probe_resp = None
|
},
|
||||||
if probe_resp is not None:
|
),
|
||||||
|
]
|
||||||
|
for endpoint, body in probe_endpoints:
|
||||||
|
async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client:
|
||||||
|
try:
|
||||||
|
probe_resp = await client.post(
|
||||||
|
endpoint,
|
||||||
|
headers={
|
||||||
|
"Authorization": f"Bearer {api_key}",
|
||||||
|
"content-type": "application/json",
|
||||||
|
},
|
||||||
|
json=body,
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
probe_headers = {k.lower(): v for k, v in probe_resp.headers.items()}
|
probe_headers = {k.lower(): v for k, v in probe_resp.headers.items()}
|
||||||
probe_rl_headers = {k: v for k, v in probe_headers.items() if "ratelimit" in k}
|
probe_rl_headers = {k: v for k, v in probe_headers.items() if "ratelimit" in k}
|
||||||
if probe_rl_headers:
|
if probe_rl_headers:
|
||||||
result.raw_headers = probe_rl_headers
|
result.raw_headers = probe_rl_headers
|
||||||
result.tokens = TokenWindow(
|
_apply_openai_ratelimit_headers(result, probe_headers)
|
||||||
limit=_parse_int_header(probe_headers, "x-ratelimit-limit-tokens"),
|
|
||||||
remaining=_parse_int_header(probe_headers, "x-ratelimit-remaining-tokens"),
|
|
||||||
reset_at=_parse_openai_reset(
|
|
||||||
probe_headers.get("x-ratelimit-reset-tokens", "")
|
|
||||||
),
|
|
||||||
)
|
|
||||||
result.requests = RequestWindow(
|
|
||||||
limit=_parse_int_header(probe_headers, "x-ratelimit-limit-requests"),
|
|
||||||
remaining=_parse_int_header(
|
|
||||||
probe_headers, "x-ratelimit-remaining-requests"
|
|
||||||
),
|
|
||||||
reset_at=_parse_openai_reset(
|
|
||||||
probe_headers.get("x-ratelimit-reset-requests", "")
|
|
||||||
),
|
|
||||||
)
|
|
||||||
if probe_resp.status_code == 200:
|
if probe_resp.status_code == 200:
|
||||||
try:
|
try:
|
||||||
payload = probe_resp.json()
|
payload = probe_resp.json()
|
||||||
usage = payload.get("usage") if isinstance(payload, dict) else None
|
in_tok, out_tok = _extract_openai_usage(payload)
|
||||||
if isinstance(usage, dict):
|
if in_tok is not None:
|
||||||
in_tok = usage.get("prompt_tokens")
|
result.sample_input_tokens = in_tok
|
||||||
out_tok = usage.get("completion_tokens")
|
if out_tok is not None:
|
||||||
if isinstance(in_tok, int):
|
result.sample_output_tokens = out_tok
|
||||||
result.sample_input_tokens = in_tok
|
|
||||||
if isinstance(out_tok, int):
|
|
||||||
result.sample_output_tokens = out_tok
|
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
elapsed_ms = probe_resp.elapsed.total_seconds() * 1000.0
|
elapsed_ms = probe_resp.elapsed.total_seconds() * 1000.0
|
||||||
result.sample_latency_ms = int(max(0.0, round(elapsed_ms)))
|
result.sample_latency_ms = int(max(0.0, round(elapsed_ms)))
|
||||||
|
if (
|
||||||
|
result.tokens.limit is not None
|
||||||
|
or result.requests.limit is not None
|
||||||
|
or result.sample_input_tokens is not None
|
||||||
|
or result.sample_output_tokens is not None
|
||||||
|
):
|
||||||
|
break
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
async def _fetch_ollama(base_url: str | None, api_key: str | None) -> ProviderUsageLive:
|
async def _fetch_ollama(base_url: str | None, api_key: str | None) -> ProviderUsageLive:
|
||||||
base = (base_url or "http://localhost:11434").rstrip("/")
|
base = _normalize_base(
|
||||||
|
base_url,
|
||||||
|
"http://localhost:11434",
|
||||||
|
strip_suffixes=("/api",),
|
||||||
|
)
|
||||||
now = utcnow()
|
now = utcnow()
|
||||||
result = ProviderUsageLive(provider="ollama", account_key="", checked_at=now, reachable=False)
|
result = ProviderUsageLive(provider="ollama", account_key="", checked_at=now, reachable=False)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,44 @@
|
||||||
|
# ruff: noqa: INP001
|
||||||
|
"""Unit tests for provider usage parsing and normalization helpers."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from app.services.provider_usage import (
|
||||||
|
_extract_openai_usage,
|
||||||
|
_normalize_base,
|
||||||
|
_parse_openai_reset,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_normalize_base_strips_known_suffixes() -> None:
|
||||||
|
assert (
|
||||||
|
_normalize_base(
|
||||||
|
"https://api.openai.com/v1/",
|
||||||
|
"https://api.openai.com",
|
||||||
|
strip_suffixes=("/v1",),
|
||||||
|
)
|
||||||
|
== "https://api.openai.com"
|
||||||
|
)
|
||||||
|
assert (
|
||||||
|
_normalize_base(
|
||||||
|
"https://ollama.com/api/",
|
||||||
|
"http://localhost:11434",
|
||||||
|
strip_suffixes=("/api",),
|
||||||
|
)
|
||||||
|
== "https://ollama.com"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_openai_usage_supports_responses_shape() -> None:
|
||||||
|
payload = {"usage": {"input_tokens": 12, "output_tokens": 3}}
|
||||||
|
assert _extract_openai_usage(payload) == (12, 3)
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_openai_usage_supports_chat_completions_shape() -> None:
|
||||||
|
payload = {"usage": {"prompt_tokens": 9, "completion_tokens": 1}}
|
||||||
|
assert _extract_openai_usage(payload) == (9, 1)
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_openai_reset_duration_format() -> None:
|
||||||
|
reset_at = _parse_openai_reset("6m0s")
|
||||||
|
assert reset_at is not None
|
||||||
Loading…
Reference in New Issue