#!/usr/bin/env python3
"""
Validate question-bank variety before building the bundled database.

Checks:
  * no exact duplicate visible question text
  * heavily templated stems stay below a configured cap
  * no rows matching known suspect-grammar patterns

Run from the repo root:
    python3 seed/validate_question_variety.py
"""

from __future__ import annotations

import argparse
import json
import re
import sqlite3
import sys
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path

ROOT = Path(__file__).resolve().parents[1]
QUESTIONS_DIR = ROOT / "seed" / "questions"
DB_PATH = ROOT / "app" / "src" / "main" / "assets" / "database" / "app.db"

# A template stem may open at most this many questions per validated source.
MAX_TEMPLATE_STEM_ROWS = 25

# Stems are compared against *normalized* text (lower-case, no punctuation).
TEMPLATE_STEMS = (
    "what do you wish i understood about",
    "what do you need from me when",
)

# (name, pattern) pairs; each pattern is searched in the lower-cased raw text.
SUSPECT_GRAMMAR_PATTERNS = (
    (
        "placeholder phrase",
        re.compile(r"\b(deeper fears|deep fears|emotional needs you hide|needs you hide|of you you protect)\b"),
    ),
    (
        "subject/verb agreement",
        re.compile(r"\bdoes (in-laws|special needs|warm greetings|coping habits|stress recovery routines)\b"),
    ),
    (
        "abstract stress intensity",
        re.compile(
            r"\b(asking for help|low energy days|unexpected problems|busy weeks|feeling supported|"
            r"feeling alone in stress|health worries|burnout signs|resetting after stress) is high\b"
        ),
    ),
    (
        "abstract stress presence",
        re.compile(r"\b(asking for help|feeling supported) is present\b"),
    ),
    (
        "plural shows up",
        re.compile(r"\b(unexpected problems|health worries) shows up\b"),
    ),
)

# Everything except word characters, whitespace and the apostrophe.
_PUNCTUATION_RE = re.compile(r"[^\w\s']")
_WHITESPACE_RE = re.compile(r"\s+")


@dataclass(frozen=True)
class QuestionRecord:
    """One question row: its identifier, visible text, and where it was loaded from."""

    id: str
    text: str
    source: str


def normalize_text(text: str) -> str:
    """Return *text* lower-cased, without punctuation, with single spaces.

    Punctuation is removed *before* whitespace is collapsed: stripping a
    standalone token such as " - " or " ?" otherwise leaves a double or
    trailing space behind, and two visibly identical questions would
    normalize to different keys.
    """
    without_punctuation = _PUNCTUATION_RE.sub("", text.lower())
    return _WHITESPACE_RE.sub(" ", without_punctuation).strip()


def matching_stem(normalized_text: str) -> str | None:
    """Return the first template stem that *normalized_text* starts with, else None."""
    for stem in TEMPLATE_STEMS:
        if normalized_text.startswith(stem):
            return stem
    return None


def _display_path(path: Path) -> str:
    """Return *path* relative to ROOT, or the path unchanged when it lies outside ROOT."""
    try:
        return str(path.relative_to(ROOT))
    except ValueError:
        # relative_to() raises for paths outside the repo; the loaders accept
        # arbitrary paths, so report the full path instead of crashing.
        return str(path)


def load_json_records(questions_dir: Path) -> list[QuestionRecord]:
    """Load every question with non-empty text from ``*.json`` in *questions_dir*.

    Files are visited in sorted order and read as UTF-8. Each file is expected
    to hold an object with an optional ``"questions"`` list.

    Raises:
        KeyError: if a question with text has no ``"id"`` key.
    """
    records: list[QuestionRecord] = []
    for path in sorted(questions_dir.glob("*.json")):
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with path.open(encoding="utf-8") as f:
            data = json.load(f)
        source = _display_path(path)
        for q in data.get("questions", []):
            text = q.get("text", "")
            if text:
                records.append(QuestionRecord(q["id"], text, source))
    return records


def load_db_records(db_path: Path) -> list[QuestionRecord]:
    """Load active questions with non-empty text from the SQLite file at *db_path*.

    Returns an empty list when the file does not exist.
    """
    if not db_path.exists():
        return []
    con = sqlite3.connect(db_path)
    try:
        rows = con.execute(
            "SELECT id, text FROM question WHERE status='active' AND text<>'' ORDER BY id"
        ).fetchall()
    finally:
        con.close()
    source = _display_path(db_path)
    return [QuestionRecord(qid, text, source) for qid, text in rows]


def _duplicate_errors(by_text: dict[str, list[QuestionRecord]], label: str) -> list[str]:
    """Report normalized texts shared by more than one record (at most 10 groups listed)."""
    duplicates = {text: items for text, items in by_text.items() if len(items) > 1}
    if not duplicates:
        return []
    extra_count = sum(len(items) - 1 for items in duplicates.values())
    errors = [f"{label}: {extra_count} duplicate visible question texts"]
    # Largest groups first, ties broken by normalized text for stable output.
    ranked = sorted(duplicates.items(), key=lambda item: (-len(item[1]), item[0]))
    for _, items in ranked[:10]:
        ids = ", ".join(record.id for record in items[:6])
        if len(items) > 6:
            ids += ", ..."
        errors.append(f" {len(items)}x {items[0].text!r} ({ids})")
    return errors


def _stem_errors(by_stem: dict[str, list[QuestionRecord]], label: str) -> list[str]:
    """Report template stems used by more than MAX_TEMPLATE_STEM_ROWS records."""
    errors: list[str] = []
    for stem, items in sorted(by_stem.items()):
        if len(items) > MAX_TEMPLATE_STEM_ROWS:
            errors.append(
                f"{label}: stem {stem!r} appears {len(items)} times "
                f"(max {MAX_TEMPLATE_STEM_ROWS})"
            )
            sample_ids = ", ".join(record.id for record in items[:8])
            errors.append(f" sample ids: {sample_ids}")
    return errors


def _grammar_errors(records: list[QuestionRecord], label: str) -> list[str]:
    """Report records whose lower-cased text matches a suspect-grammar pattern."""
    errors: list[str] = []
    # Lower-case each text once instead of once per pattern.
    lowered = [(record, record.text.lower()) for record in records]
    for name, pattern in SUSPECT_GRAMMAR_PATTERNS:
        matches = [record for record, text in lowered if pattern.search(text)]
        if matches:
            ids = ", ".join(record.id for record in matches[:8])
            if len(matches) > 8:
                ids += ", ..."
            errors.append(f"{label}: {len(matches)} suspect {name} rows ({ids})")
    return errors


def validate_records(records: list[QuestionRecord], label: str) -> list[str]:
    """Run all variety checks on *records* and return error lines.

    Args:
        records: questions to validate.
        label: prefix used in the error lines (e.g. "JSON" or "DB").

    Returns:
        Error lines in the order duplicates, over-used stems, suspect
        grammar; an empty list when every check passes.
    """
    by_text: dict[str, list[QuestionRecord]] = defaultdict(list)
    by_stem: dict[str, list[QuestionRecord]] = defaultdict(list)

    for record in records:
        normalized = normalize_text(record.text)
        by_text[normalized].append(record)
        stem = matching_stem(normalized)
        if stem:
            by_stem[stem].append(record)

    errors: list[str] = []
    errors.extend(_duplicate_errors(by_text, label))
    errors.extend(_stem_errors(by_stem, label))
    errors.extend(_grammar_errors(records, label))
    return errors


def main() -> int:
    """Validate the JSON seeds (and the asset DB unless --json-only); return the exit code."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--json-only", action="store_true", help="Skip validating the asset DB")
    args = parser.parse_args()

    checks = [("JSON", load_json_records(QUESTIONS_DIR))]
    if not args.json_only:
        db_records = load_db_records(DB_PATH)
        # A missing or empty DB is skipped rather than treated as a failure.
        if db_records:
            checks.append(("DB", db_records))

    errors: list[str] = []
    for label, records in checks:
        errors.extend(validate_records(records, label))

    if errors:
        print("Question variety check failed:")
        print("\n".join(errors))
        return 1

    print("Question variety check passed")
    return 0


if __name__ == "__main__":
    sys.exit(main())