fix(scripts): codex/ollama fallback
This commit is contained in:
parent
9650283367
commit
fa5a18bccd
|
|
@ -1,7 +1,8 @@
|
|||
"""Live provider usage — fetch real token limits and reset times directly from provider APIs.
|
||||
"""Live provider usage — fetch token limits and reset times from provider APIs.
|
||||
|
||||
Each provider exposes rate-limit headers on every API response. Pipeline calls
|
||||
a lightweight, zero-token-cost endpoint (model list) and reads those headers.
|
||||
Some providers expose rate-limit headers on regular API responses. Pipeline first
|
||||
calls a lightweight endpoint (model list) and then, when needed, performs a
|
||||
minimal generation probe to surface usage and latency details.
|
||||
|
||||
No guessing, no JSONL scanning, no estimates. If the provider API is
|
||||
unreachable or the key is invalid, an error is returned and all limit fields
|
||||
|
|
@ -21,7 +22,8 @@ openai → GET https://api.openai.com/v1/models
|
|||
x-ratelimit-reset-tokens, x-ratelimit-limit-requests,
|
||||
x-ratelimit-remaining-requests, x-ratelimit-reset-requests
|
||||
Fallback probe (only when headers missing):
|
||||
POST /v1/chat/completions with max_tokens=1 to surface usage+time.
|
||||
POST /v1/responses with max_output_tokens=1 (preferred),
|
||||
then /v1/chat/completions with max_tokens=1 (compatibility).
|
||||
|
||||
ollama → GET {base_url}/api/tags (health-check only; no rate limits)
|
||||
Returns: model list, server reachable flag
|
||||
|
|
@ -215,7 +217,20 @@ def _pick_anthropic_probe_model(models: list[str]) -> str | None:
|
|||
def _pick_openai_probe_model(models: list[str]) -> str | None:
|
||||
if not models:
|
||||
return None
|
||||
priorities = ("gpt-4.1-mini", "gpt-4o-mini", "gpt-4.1", "gpt-4o", "o4-mini")
|
||||
priorities = (
|
||||
"gpt-5.5",
|
||||
"gpt-5.4",
|
||||
"gpt-5.3-codex",
|
||||
"gpt-5.2-codex",
|
||||
"gpt-5.1-codex",
|
||||
"gpt-5-codex",
|
||||
"codex",
|
||||
"gpt-4.1-mini",
|
||||
"gpt-4o-mini",
|
||||
"gpt-4.1",
|
||||
"gpt-4o",
|
||||
"o4-mini",
|
||||
)
|
||||
lowered = [(m, m.lower()) for m in models]
|
||||
for priority in priorities:
|
||||
for original, lowered_name in lowered:
|
||||
|
|
@ -224,6 +239,16 @@ def _pick_openai_probe_model(models: list[str]) -> str | None:
|
|||
return models[0]
|
||||
|
||||
|
||||
def _normalize_base(base_url: str | None, default_base: str, *, strip_suffixes: tuple[str, ...]) -> str:
|
||||
base = (base_url or default_base).strip().rstrip("/")
|
||||
lowered = base.lower()
|
||||
for suffix in strip_suffixes:
|
||||
if lowered.endswith(suffix):
|
||||
base = base[: -len(suffix)]
|
||||
break
|
||||
return base.rstrip("/")
|
||||
|
||||
|
||||
def _parse_openai_reset(value: str) -> datetime | None:
|
||||
"""Parse an OpenAI reset header: ISO datetime OR duration like '1m30s'."""
|
||||
if not value:
|
||||
|
|
@ -242,12 +267,54 @@ def _parse_openai_reset(value: str) -> datetime | None:
|
|||
return None
|
||||
|
||||
|
||||
def _apply_openai_ratelimit_headers(result: ProviderUsageLive, headers: dict[str, str]) -> None:
    """Overwrite ``result.tokens`` and ``result.requests`` from ``x-ratelimit-*`` headers.

    Both windows are replaced wholesale with freshly built objects; how a
    missing header is represented is decided by ``_parse_int_header`` and
    ``_parse_openai_reset``.

    Args:
        result: Usage record to update in place.
        headers: Response headers. Lookups use lower-case header names, so
            callers presumably pass lower-cased keys — confirm for
            case-sensitive mappings.
    """
    # Token window first, mirroring the order the fields are assigned.
    token_limit = _parse_int_header(headers, "x-ratelimit-limit-tokens")
    token_remaining = _parse_int_header(headers, "x-ratelimit-remaining-tokens")
    token_reset = _parse_openai_reset(headers.get("x-ratelimit-reset-tokens", ""))
    result.tokens = TokenWindow(
        limit=token_limit,
        remaining=token_remaining,
        reset_at=token_reset,
    )

    request_limit = _parse_int_header(headers, "x-ratelimit-limit-requests")
    request_remaining = _parse_int_header(headers, "x-ratelimit-remaining-requests")
    request_reset = _parse_openai_reset(headers.get("x-ratelimit-reset-requests", ""))
    result.requests = RequestWindow(
        limit=request_limit,
        remaining=request_remaining,
        reset_at=request_reset,
    )
|
||||
|
||||
|
||||
def _extract_openai_usage(payload: Any) -> tuple[int | None, int | None]:
|
||||
if not isinstance(payload, dict):
|
||||
return (None, None)
|
||||
usage = payload.get("usage")
|
||||
if not isinstance(usage, dict):
|
||||
return (None, None)
|
||||
|
||||
# Responses API style
|
||||
in_tok = usage.get("input_tokens")
|
||||
out_tok = usage.get("output_tokens")
|
||||
if isinstance(in_tok, int) or isinstance(out_tok, int):
|
||||
return (
|
||||
in_tok if isinstance(in_tok, int) else None,
|
||||
out_tok if isinstance(out_tok, int) else None,
|
||||
)
|
||||
|
||||
# Chat Completions style
|
||||
in_tok = usage.get("prompt_tokens")
|
||||
out_tok = usage.get("completion_tokens")
|
||||
return (
|
||||
in_tok if isinstance(in_tok, int) else None,
|
||||
out_tok if isinstance(out_tok, int) else None,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Provider-specific fetch functions
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
async def _fetch_anthropic(api_key: str, base_url: str | None) -> ProviderUsageLive:
|
||||
base = (base_url or "https://api.anthropic.com").rstrip("/")
|
||||
base = _normalize_base(
|
||||
base_url,
|
||||
"https://api.anthropic.com",
|
||||
strip_suffixes=("/v1",),
|
||||
)
|
||||
now = utcnow()
|
||||
result = ProviderUsageLive(provider="anthropic", account_key="", checked_at=now, reachable=False)
|
||||
|
||||
|
|
@ -344,7 +411,11 @@ async def _fetch_anthropic(api_key: str, base_url: str | None) -> ProviderUsageL
|
|||
|
||||
|
||||
async def _fetch_openai(api_key: str, base_url: str | None) -> ProviderUsageLive:
|
||||
base = (base_url or "https://api.openai.com").rstrip("/")
|
||||
base = _normalize_base(
|
||||
base_url,
|
||||
"https://api.openai.com",
|
||||
strip_suffixes=("/v1",),
|
||||
)
|
||||
now = utcnow()
|
||||
result = ProviderUsageLive(provider="openai", account_key="", checked_at=now, reachable=False)
|
||||
|
||||
|
|
@ -373,16 +444,7 @@ async def _fetch_openai(api_key: str, base_url: str | None) -> ProviderUsageLive
|
|||
result.reachable = True
|
||||
result.raw_headers = {k: v for k, v in h.items() if "ratelimit" in k}
|
||||
|
||||
result.tokens = TokenWindow(
|
||||
limit = _parse_int_header(h, "x-ratelimit-limit-tokens"),
|
||||
remaining = _parse_int_header(h, "x-ratelimit-remaining-tokens"),
|
||||
reset_at = _parse_openai_reset(h.get("x-ratelimit-reset-tokens", "")),
|
||||
)
|
||||
result.requests = RequestWindow(
|
||||
limit = _parse_int_header(h, "x-ratelimit-limit-requests"),
|
||||
remaining = _parse_int_header(h, "x-ratelimit-remaining-requests"),
|
||||
reset_at = _parse_openai_reset(h.get("x-ratelimit-reset-requests", "")),
|
||||
)
|
||||
_apply_openai_ratelimit_headers(result, h)
|
||||
|
||||
try:
|
||||
data = resp.json()
|
||||
|
|
@ -395,64 +457,71 @@ async def _fetch_openai(api_key: str, base_url: str | None) -> ProviderUsageLive
|
|||
probe_model = _pick_openai_probe_model(result.models)
|
||||
if probe_model:
|
||||
result.sample_model = probe_model
|
||||
async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client:
|
||||
try:
|
||||
probe_resp = await client.post(
|
||||
f"{base}/v1/chat/completions",
|
||||
headers={
|
||||
"Authorization": f"Bearer {api_key}",
|
||||
"content-type": "application/json",
|
||||
},
|
||||
json={
|
||||
"model": probe_model,
|
||||
"messages": [{"role": "user", "content": "Usage probe"}],
|
||||
"max_tokens": 1,
|
||||
},
|
||||
)
|
||||
except Exception:
|
||||
probe_resp = None
|
||||
if probe_resp is not None:
|
||||
probe_endpoints: list[tuple[str, dict[str, Any]]] = [
|
||||
(
|
||||
f"{base}/v1/responses",
|
||||
{
|
||||
"model": probe_model,
|
||||
"input": "Usage probe",
|
||||
"max_output_tokens": 1,
|
||||
},
|
||||
),
|
||||
(
|
||||
f"{base}/v1/chat/completions",
|
||||
{
|
||||
"model": probe_model,
|
||||
"messages": [{"role": "user", "content": "Usage probe"}],
|
||||
"max_tokens": 1,
|
||||
},
|
||||
),
|
||||
]
|
||||
for endpoint, body in probe_endpoints:
|
||||
async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client:
|
||||
try:
|
||||
probe_resp = await client.post(
|
||||
endpoint,
|
||||
headers={
|
||||
"Authorization": f"Bearer {api_key}",
|
||||
"content-type": "application/json",
|
||||
},
|
||||
json=body,
|
||||
)
|
||||
except Exception:
|
||||
continue
|
||||
probe_headers = {k.lower(): v for k, v in probe_resp.headers.items()}
|
||||
probe_rl_headers = {k: v for k, v in probe_headers.items() if "ratelimit" in k}
|
||||
if probe_rl_headers:
|
||||
result.raw_headers = probe_rl_headers
|
||||
result.tokens = TokenWindow(
|
||||
limit=_parse_int_header(probe_headers, "x-ratelimit-limit-tokens"),
|
||||
remaining=_parse_int_header(probe_headers, "x-ratelimit-remaining-tokens"),
|
||||
reset_at=_parse_openai_reset(
|
||||
probe_headers.get("x-ratelimit-reset-tokens", "")
|
||||
),
|
||||
)
|
||||
result.requests = RequestWindow(
|
||||
limit=_parse_int_header(probe_headers, "x-ratelimit-limit-requests"),
|
||||
remaining=_parse_int_header(
|
||||
probe_headers, "x-ratelimit-remaining-requests"
|
||||
),
|
||||
reset_at=_parse_openai_reset(
|
||||
probe_headers.get("x-ratelimit-reset-requests", "")
|
||||
),
|
||||
)
|
||||
_apply_openai_ratelimit_headers(result, probe_headers)
|
||||
if probe_resp.status_code == 200:
|
||||
try:
|
||||
payload = probe_resp.json()
|
||||
usage = payload.get("usage") if isinstance(payload, dict) else None
|
||||
if isinstance(usage, dict):
|
||||
in_tok = usage.get("prompt_tokens")
|
||||
out_tok = usage.get("completion_tokens")
|
||||
if isinstance(in_tok, int):
|
||||
result.sample_input_tokens = in_tok
|
||||
if isinstance(out_tok, int):
|
||||
result.sample_output_tokens = out_tok
|
||||
in_tok, out_tok = _extract_openai_usage(payload)
|
||||
if in_tok is not None:
|
||||
result.sample_input_tokens = in_tok
|
||||
if out_tok is not None:
|
||||
result.sample_output_tokens = out_tok
|
||||
except Exception:
|
||||
pass
|
||||
elapsed_ms = probe_resp.elapsed.total_seconds() * 1000.0
|
||||
result.sample_latency_ms = int(max(0.0, round(elapsed_ms)))
|
||||
if (
|
||||
result.tokens.limit is not None
|
||||
or result.requests.limit is not None
|
||||
or result.sample_input_tokens is not None
|
||||
or result.sample_output_tokens is not None
|
||||
):
|
||||
break
|
||||
|
||||
return result
|
||||
|
||||
|
||||
async def _fetch_ollama(base_url: str | None, api_key: str | None) -> ProviderUsageLive:
|
||||
base = (base_url or "http://localhost:11434").rstrip("/")
|
||||
base = _normalize_base(
|
||||
base_url,
|
||||
"http://localhost:11434",
|
||||
strip_suffixes=("/api",),
|
||||
)
|
||||
now = utcnow()
|
||||
result = ProviderUsageLive(provider="ollama", account_key="", checked_at=now, reachable=False)
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,44 @@
|
|||
# ruff: noqa: INP001
|
||||
"""Unit tests for provider usage parsing and normalization helpers."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from app.services.provider_usage import (
|
||||
_extract_openai_usage,
|
||||
_normalize_base,
|
||||
_parse_openai_reset,
|
||||
)
|
||||
|
||||
|
||||
def test_normalize_base_strips_known_suffixes() -> None:
    """A trailing API path segment and trailing slash are removed from the base URL."""
    openai_base = _normalize_base(
        "https://api.openai.com/v1/",
        "https://api.openai.com",
        strip_suffixes=("/v1",),
    )
    assert openai_base == "https://api.openai.com"

    ollama_base = _normalize_base(
        "https://ollama.com/api/",
        "http://localhost:11434",
        strip_suffixes=("/api",),
    )
    assert ollama_base == "https://ollama.com"
|
||||
|
||||
|
||||
def test_extract_openai_usage_supports_responses_shape() -> None:
    """Responses API usage keys are returned as an ``(input, output)`` pair."""
    usage = {"input_tokens": 12, "output_tokens": 3}
    extracted = _extract_openai_usage({"usage": usage})
    assert extracted == (12, 3)
|
||||
|
||||
|
||||
def test_extract_openai_usage_supports_chat_completions_shape() -> None:
    """Chat Completions usage keys are returned as an ``(input, output)`` pair."""
    usage = {"prompt_tokens": 9, "completion_tokens": 1}
    extracted = _extract_openai_usage({"usage": usage})
    assert extracted == (9, 1)
|
||||
|
||||
|
||||
def test_parse_openai_reset_duration_format() -> None:
    """A duration-style reset header such as ``6m0s`` must parse to a value, not ``None``."""
    parsed = _parse_openai_reset("6m0s")
    assert parsed is not None
|
||||
Loading…
Reference in New Issue