Closer/seed/validate_question_variety.py

#!/usr/bin/env python3
"""
Validate question-bank variety before building the bundled database.

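Checks:
  * no duplicate visible question text (compared after normalizing case,
    whitespace and punctuation)
  * heavily templated stems stay below a configured cap
  * no known suspect grammar patterns (placeholder phrases, agreement slips)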

Run from the repo root:
  python3 seed/validate_question_variety.py
"""
from __future__ import annotations

import argparse
import json
import re
import sqlite3
import sys
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path

# Repository root: two levels up from this file's resolved location.
ROOT = Path(__file__).resolve().parents[1]
# Directory scanned for the *.json question files.
QUESTIONS_DIR = ROOT.joinpath("seed", "questions")
# Bundled SQLite asset that is validated unless --json-only is given.
DB_PATH = ROOT.joinpath("app", "src", "main", "assets", "database", "app.db")

# Largest number of questions allowed to open with any single template stem.
MAX_TEMPLATE_STEM_ROWS = 25
# Stems are matched against normalized text, hence lowercase and unpunctuated.
TEMPLATE_STEMS = (
    "what do you wish i understood about",
    "what do you need from me when",
)

# Known-bad phrasings, each paired with the short label shown in the error
# report. Patterns are written in lowercase because validate_records()
# lowercases the question text before searching.
# NOTE(review): these look like leftovers of templated question generation
# (topic phrases dropped into a fixed sentence frame) -- confirm with the author.
SUSPECT_GRAMMAR_PATTERNS = (
    (
        # Fixed phrases that should not appear verbatim in a finished question.
        "placeholder phrase",
        re.compile(r"\b(deeper fears|deep fears|emotional needs you hide|needs you hide|of you you protect)\b"),
    ),
    (
        # Singular "does" directly followed by one of these topic phrases.
        "subject/verb agreement",
        re.compile(r"\bdoes (in-laws|special needs|warm greetings|coping habits|stress recovery routines)\b"),
    ),
    (
        # One of these topic phrases directly followed by "is high".
        "abstract stress intensity",
        re.compile(
            r"\b(asking for help|low energy days|unexpected problems|busy weeks|feeling supported|"
            r"feeling alone in stress|health worries|burnout signs|resetting after stress) is high\b"
        ),
    ),
    (
        # One of these topic phrases directly followed by "is present".
        "abstract stress presence",
        re.compile(r"\b(asking for help|feeling supported) is present\b"),
    ),
    (
        # Plural topic phrase directly followed by singular "shows up".
        "plural shows up",
        re.compile(r"\b(unexpected problems|health worries) shows up\b"),
    ),
)


@dataclass(frozen=True)
class QuestionRecord:
    """One question loaded from a JSON seed file or from the asset database."""

    # Question identifier: the "id" field of a JSON entry or the id column of a DB row.
    id: str
    # Visible question text; both loaders skip entries whose text is empty.
    text: str
    # Path of the originating file, relative to ROOT, stored as a string.
    source: str


def normalize_text(text: str) -> str:
    """Return a canonical form of *text* for duplicate and stem comparison.

    The text is lowercased, every character that is not a word character,
    whitespace or a straight apostrophe is removed, and whitespace is then
    collapsed to single spaces and trimmed.

    Punctuation is removed *before* whitespace is collapsed so that the gap
    left by a free-standing mark ("a - b", "really ?") cannot survive as a
    double or trailing space and make otherwise identical questions compare
    as different.

    Args:
        text: Raw visible question text.

    Returns:
        The normalized string; empty when *text* contains no word characters
        or apostrophes.
    """
    without_punctuation = re.sub(r"[^\w\s']", "", text.lower())
    return re.sub(r"\s+", " ", without_punctuation).strip()


def matching_stem(normalized_text: str) -> str | None:
    """Return the first entry of TEMPLATE_STEMS that *normalized_text* starts with.

    Stems are tried in declaration order; ``None`` is returned when none match.
    """
    candidates = (stem for stem in TEMPLATE_STEMS if normalized_text.startswith(stem))
    return next(candidates, None)


def load_json_records(questions_dir: Path) -> list[QuestionRecord]:
    """Load every question with non-empty text from the JSON files in *questions_dir*.

    Files matching ``*.json`` directly inside the directory are read in sorted
    path order. Each file is expected to hold an object whose optional
    ``"questions"`` list contains dicts with ``"id"`` and ``"text"`` keys;
    entries whose text is missing or empty are skipped.

    Args:
        questions_dir: Directory to scan for ``*.json`` files.

    Returns:
        One ``QuestionRecord`` per kept question, with ``source`` set to the
        file's path relative to ``ROOT``.

    Raises:
        KeyError: If a question that has text lacks an ``"id"``.
        ValueError: If a file is not valid UTF-8 or not valid JSON, or if a
            file contributing records is not located under ``ROOT``.
    """
    records: list[QuestionRecord] = []
    for path in sorted(questions_dir.glob("*.json")):
        # Decode explicitly as UTF-8: relying on the locale's default encoding
        # mis-decodes (or fails on) non-ASCII question text on hosts whose
        # default is not UTF-8.
        with path.open(encoding="utf-8") as f:
            data = json.load(f)
        for q in data.get("questions", []):
            text = q.get("text", "")
            if text:
                records.append(QuestionRecord(q["id"], text, str(path.relative_to(ROOT))))
    return records


def load_db_records(db_path: Path) -> list[QuestionRecord]:
    """Read the active, non-empty questions from the SQLite file at *db_path*.

    A missing database file is not an error: an empty list is returned.
    Otherwise rows come back ordered by id, each tagged with the database
    path relative to ROOT as its source.
    """
    if not db_path.exists():
        return []

    query = "SELECT id, text FROM question WHERE status='active' AND text<>'' ORDER BY id"
    connection = sqlite3.connect(db_path)
    try:
        rows = connection.execute(query).fetchall()
    finally:
        # Release the handle even when the query fails.
        connection.close()

    loaded: list[QuestionRecord] = []
    for question_id, question_text in rows:
        loaded.append(QuestionRecord(question_id, question_text, str(db_path.relative_to(ROOT))))
    return loaded


def validate_records(records: list[QuestionRecord], label: str) -> list[str]:
    """Run all variety checks over *records* and return the report lines.

    Three checks run in order: duplicate normalized texts, template stems used
    more than MAX_TEMPLATE_STEM_ROWS times, and matches against
    SUSPECT_GRAMMAR_PATTERNS. Headline lines are prefixed with *label*;
    indented lines carry sample ids. An empty list means every check passed.
    """
    # Single pass: group records by normalized text and by matching stem.
    grouped_by_text: dict[str, list[QuestionRecord]] = defaultdict(list)
    grouped_by_stem: dict[str, list[QuestionRecord]] = defaultdict(list)
    for rec in records:
        key = normalize_text(rec.text)
        grouped_by_text[key].append(rec)
        stem_hit = matching_stem(key)
        if stem_hit:
            grouped_by_stem[stem_hit].append(rec)

    problems: list[str] = []

    # Check 1: the same normalized text appearing more than once.
    repeated = [(key, group) for key, group in grouped_by_text.items() if len(group) > 1]
    if repeated:
        surplus = sum(len(group) - 1 for _, group in repeated)
        problems.append(f"{label}: {surplus} duplicate visible question texts")
        # Largest groups first, ties broken alphabetically; show at most ten.
        repeated.sort(key=lambda pair: (-len(pair[1]), pair[0]))
        for _, group in repeated[:10]:
            shown_ids = ", ".join(rec.id for rec in group[:6])
            if len(group) > 6:
                shown_ids += ", ..."
            problems.append(f"  {len(group)}x {group[0].text!r} ({shown_ids})")

    # Check 2: template stems used more often than the cap allows.
    for stem_key in sorted(grouped_by_stem):
        group = grouped_by_stem[stem_key]
        if len(group) <= MAX_TEMPLATE_STEM_ROWS:
            continue
        problems.append(
            f"{label}: stem {stem_key!r} appears {len(group)} times "
            f"(max {MAX_TEMPLATE_STEM_ROWS})"
        )
        sample = ", ".join(rec.id for rec in group[:8])
        problems.append(f"  sample ids: {sample}")

    # Check 3: known-bad phrasings, searched in the lowercased raw text.
    for name, pattern in SUSPECT_GRAMMAR_PATTERNS:
        flagged = [rec for rec in records if pattern.search(rec.text.lower())]
        if not flagged:
            continue
        flagged_ids = ", ".join(rec.id for rec in flagged[:8])
        if len(flagged) > 8:
            flagged_ids += ", ..."
        problems.append(f"{label}: {len(flagged)} suspect {name} rows ({flagged_ids})")

    return problems


def main() -> int:
    """Validate the JSON seed questions and, unless skipped, the asset DB.

    Returns:
        0 when every check passes, 1 after printing the failures otherwise.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--json-only", action="store_true", help="Skip validating the asset DB")
    options = parser.parse_args()

    datasets = [("JSON", load_json_records(QUESTIONS_DIR))]
    if not options.json_only:
        # A missing or empty DB yields no records and is silently skipped.
        from_db = load_db_records(DB_PATH)
        if from_db:
            datasets.append(("DB", from_db))

    failures: list[str] = [
        line
        for label, records in datasets
        for line in validate_records(records, label)
    ]

    if not failures:
        print("Question variety check passed")
        return 0

    print("Question variety check failed:")
    print("\n".join(failures))
    return 1


# Script entry point: the process exit status is whatever main() returns.
if __name__ == "__main__":
    raise SystemExit(main())