Closer/seed/validate_question_variety.py

175 lines
5.3 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/env python3
"""
Validate question-bank variety before building the bundled database.
Checks:
* no exact duplicate visible question text
* heavily templated stems stay below a configured cap
Run from the repo root:
python3 seed/validate_question_variety.py
"""
from __future__ import annotations
import argparse
import json
import re
import sqlite3
import sys
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
# Directory two levels above this file (parents[1] of the resolved script path).
ROOT = Path(__file__).resolve().parents[1]

# Inputs that get validated: the JSON question files and the bundled SQLite DB.
QUESTIONS_DIR = ROOT.joinpath("seed", "questions")
DB_PATH = ROOT.joinpath("app", "src", "main", "assets", "database", "app.db")

# A single template stem may open at most this many questions.
MAX_TEMPLATE_STEM_ROWS = 25

# Stems are compared against *normalized* text, hence lower-case and
# punctuation-free.
TEMPLATE_STEMS = (
    "what do you wish i understood about",
    "what do you need from me when",
)

# (label, compiled regex) pairs. Each regex is searched against the
# lower-cased question text; the label is used in the error message.
SUSPECT_GRAMMAR_PATTERNS = tuple(
    (label, re.compile(regex))
    for label, regex in (
        (
            "placeholder phrase",
            r"\b(deeper fears|deep fears|emotional needs you hide|needs you hide|of you you protect)\b",
        ),
        (
            "subject/verb agreement",
            r"\bdoes (in-laws|special needs|warm greetings|coping habits|stress recovery routines)\b",
        ),
        (
            "abstract stress intensity",
            r"\b(asking for help|low energy days|unexpected problems|busy weeks|feeling supported|"
            r"feeling alone in stress|health worries|burnout signs|resetting after stress) is high\b",
        ),
        (
            "abstract stress presence",
            r"\b(asking for help|feeling supported) is present\b",
        ),
        (
            "plural shows up",
            r"\b(unexpected problems|health worries) shows up\b",
        ),
    )
)
@dataclass(frozen=True)
class QuestionRecord:
    """One question as seen by the validator (immutable).

    Fields, in positional order:
        id:     identifier of the question, echoed in error messages.
        text:   the visible question text that the checks inspect.
        source: where the record was loaded from; the loaders in this file
                pass a path relative to the repository root.
    """

    id: str
    text: str
    source: str
def normalize_text(text: str) -> str:
    """Return a canonical form of *text* for duplicate and stem comparison.

    The result is lower-cased, has every character other than word
    characters, whitespace and apostrophes removed, and has whitespace
    collapsed to single spaces with no leading or trailing space.

    Args:
        text: Raw, user-visible question text.

    Returns:
        The normalized string (possibly empty).
    """
    lowered = text.lower()
    # Typographic apostrophes would otherwise be deleted as punctuation while
    # ASCII ones are kept, making "don’t" and "don't" compare as different.
    lowered = lowered.replace("\u2019", "'").replace("\u2018", "'")
    # Drop punctuation *before* collapsing whitespace: removing a standalone
    # token such as " - " or a leading "..." leaves extra spaces behind, and
    # those must not make otherwise identical questions look distinct.
    lowered = re.sub(r"[^\w\s']", "", lowered)
    return re.sub(r"\s+", " ", lowered).strip()
def matching_stem(normalized_text: str) -> str | None:
    """Return the first entry of TEMPLATE_STEMS that *normalized_text* starts with.

    Args:
        normalized_text: Question text already passed through normalization.

    Returns:
        The matching stem, or None when no configured stem is a prefix.
    """
    candidates = (stem for stem in TEMPLATE_STEMS if normalized_text.startswith(stem))
    return next(candidates, None)
def load_json_records(questions_dir: Path) -> list[QuestionRecord]:
    """Load question records from every ``*.json`` file in *questions_dir*.

    Files are visited in sorted order. Each file is expected to hold an
    object with a ``"questions"`` list; entries whose ``"text"`` is missing
    or empty are skipped.

    Args:
        questions_dir: Directory containing the question JSON files.

    Returns:
        One QuestionRecord per question with non-empty text; ``source`` is the
        file path relative to ROOT.

    Raises:
        KeyError: If a question with text has no ``"id"``.
        json.JSONDecodeError: If a file is not valid JSON.
    """
    records: list[QuestionRecord] = []
    for path in sorted(questions_dir.glob("*.json")):
        # Decode explicitly as UTF-8: the default for open() is the platform
        # locale encoding, which can mis-decode or reject non-ASCII question
        # text on systems whose locale is not UTF-8.
        with path.open(encoding="utf-8") as f:
            data = json.load(f)
        for q in data.get("questions", []):
            text = q.get("text", "")
            if text:
                records.append(QuestionRecord(q["id"], text, str(path.relative_to(ROOT))))
    return records
def load_db_records(db_path: Path) -> list[QuestionRecord]:
    """Load active, non-empty questions from the SQLite database at *db_path*.

    Args:
        db_path: Location of the SQLite file.

    Returns:
        Records ordered by id, with ``source`` set to the database path
        relative to ROOT. An empty list when the file does not exist.
    """
    if db_path.exists():
        connection = sqlite3.connect(db_path)
        try:
            cursor = connection.execute(
                "SELECT id, text FROM question WHERE status='active' AND text<>'' ORDER BY id"
            )
            fetched = cursor.fetchall()
        finally:
            # Always release the connection, even if the query fails.
            connection.close()

        loaded: list[QuestionRecord] = []
        for question_id, question_text in fetched:
            loaded.append(
                QuestionRecord(question_id, question_text, str(db_path.relative_to(ROOT)))
            )
        return loaded
    return []
def validate_records(records: list[QuestionRecord], label: str) -> list[str]:
    """Run the variety checks over *records* and describe every failure.

    Three checks are applied, in this order:
      1. duplicate question text (compared after normalization),
      2. template stems used more than MAX_TEMPLATE_STEM_ROWS times,
      3. text matching any entry of SUSPECT_GRAMMAR_PATTERNS.

    Args:
        records: The questions to inspect.
        label: Prefix identifying the data set in each message.

    Returns:
        Human-readable error lines; empty when all checks pass.
    """
    errors: list[str] = []

    # Bucket each record by its normalized text and, if any, its template stem.
    text_groups = defaultdict(list)
    stem_groups = defaultdict(list)
    for rec in records:
        key = normalize_text(rec.text)
        text_groups[key].append(rec)
        stem = matching_stem(key)
        if stem:
            stem_groups[stem].append(rec)

    # 1. Duplicates: report the total surplus, then the ten largest groups.
    repeated = [(key, group) for key, group in text_groups.items() if len(group) > 1]
    if repeated:
        surplus = sum(len(group) - 1 for _, group in repeated)
        errors.append(f"{label}: {surplus} duplicate visible question texts")
        # Largest groups first; ties broken by normalized text.
        repeated.sort(key=lambda pair: (-len(pair[1]), pair[0]))
        for _, group in repeated[:10]:
            shown = [rec.id for rec in group[:6]]
            if len(group) > 6:
                shown.append("...")
            ids = ", ".join(shown)
            errors.append(f" {len(group)}x {group[0].text!r} ({ids})")

    # 2. Over-used template stems, in alphabetical order of stem.
    for stem in sorted(stem_groups):
        group = stem_groups[stem]
        if len(group) <= MAX_TEMPLATE_STEM_ROWS:
            continue
        errors.append(
            f"{label}: stem {stem!r} appears {len(group)} times "
            f"(max {MAX_TEMPLATE_STEM_ROWS})"
        )
        sample_ids = ", ".join(rec.id for rec in group[:8])
        errors.append(f" sample ids: {sample_ids}")

    # 3. Suspect grammar: patterns are matched against lower-cased raw text.
    for name, pattern in SUSPECT_GRAMMAR_PATTERNS:
        hit_ids = [rec.id for rec in records if pattern.search(rec.text.lower())]
        if not hit_ids:
            continue
        ids = ", ".join(hit_ids[:8])
        if len(hit_ids) > 8:
            ids += ", ..."
        errors.append(f"{label}: {len(hit_ids)} suspect {name} rows ({ids})")

    return errors
def main() -> int:
    """Validate the JSON questions and, unless --json-only is given, the asset DB.

    Returns:
        0 when every check passes, 1 when any error was reported.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--json-only", action="store_true", help="Skip validating the asset DB")
    options = cli.parse_args()

    record_sets = [("JSON", load_json_records(QUESTIONS_DIR))]
    if not options.json_only:
        from_db = load_db_records(DB_PATH)
        # A missing or empty DB yields no records and is not validated.
        if from_db:
            record_sets.append(("DB", from_db))

    problems = [
        message
        for source_label, rows in record_sets
        for message in validate_records(rows, source_label)
    ]
    if not problems:
        print("Question variety check passed")
        return 0

    print("Question variety check failed:")
    print("\n".join(problems))
    return 1


if __name__ == "__main__":
    sys.exit(main())