175 lines
5.3 KiB
Python
175 lines
5.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
Validate question-bank variety before building the bundled database.

Checks:
  * no exact duplicate visible question text
  * heavily templated stems stay below a configured cap
  * no known placeholder phrases or suspect-grammar constructions

Run from the repo root:
  python3 seed/validate_question_variety.py
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
import sqlite3
|
|
import sys
|
|
from collections import defaultdict
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
|
|
# Two directory levels above this script (the docstring says to run it from
# the repo root as seed/validate_question_variety.py).
ROOT = Path(__file__).resolve().parents[1]

# Directory scanned for *.json question files.
QUESTIONS_DIR = ROOT.joinpath("seed", "questions")

# SQLite database that is validated in addition to the JSON files.
DB_PATH = ROOT.joinpath("app", "src", "main", "assets", "database", "app.db")

# A single template stem may open at most this many questions per data set.
MAX_TEMPLATE_STEM_ROWS = 25

# Stems are compared against *normalized* text (lowercase, punctuation
# stripped), so they must be written in that form.
TEMPLATE_STEMS = (
    "what do you wish i understood about",
    "what do you need from me when",
)
|
|
|
|
# (issue name, regex source) pairs. Each regex is searched against the
# lowercased question text, so the alternatives are written in lowercase.
_SUSPECT_GRAMMAR_SOURCES = (
    (
        "placeholder phrase",
        r"\b(deeper fears|deep fears|emotional needs you hide|needs you hide|of you you protect)\b",
    ),
    (
        "subject/verb agreement",
        r"\bdoes (in-laws|special needs|warm greetings|coping habits|stress recovery routines)\b",
    ),
    (
        "abstract stress intensity",
        r"\b(asking for help|low energy days|unexpected problems|busy weeks|feeling supported|"
        r"feeling alone in stress|health worries|burnout signs|resetting after stress) is high\b",
    ),
    (
        "abstract stress presence",
        r"\b(asking for help|feeling supported) is present\b",
    ),
    (
        "plural shows up",
        r"\b(unexpected problems|health worries) shows up\b",
    ),
)

# Tuple of (issue name, compiled pattern); compiled once at import time.
SUSPECT_GRAMMAR_PATTERNS = tuple(
    (issue, re.compile(expression)) for issue, expression in _SUSPECT_GRAMMAR_SOURCES
)
|
|
|
|
|
|
@dataclass(frozen=True)
class QuestionRecord:
    """One question as loaded from a JSON file or from the database.

    Instances are immutable (``frozen=True``).
    """

    # Identifier of the question, used when reporting problems.
    id: str
    # Question text exactly as stored (not normalized).
    text: str
    # Where the record was loaded from; the loaders pass a ROOT-relative path.
    source: str
|
|
|
|
|
|
def normalize_text(text: str) -> str:
    """Return a canonical form of *text* for duplicate and stem comparison.

    The result is lowercase, contains only word characters, straight
    apostrophes and single spaces, and has no leading or trailing whitespace.

    Punctuation is removed *before* whitespace is collapsed. Doing it the
    other way round leaves double or trailing spaces behind wherever a
    punctuation mark stood between spaces ("a - b", "what ?"), so visually
    identical questions would compare as different and template stems
    could fail to match.

    Args:
        text: Raw question text.

    Returns:
        The normalized text; applying the function again is a no-op.
    """
    lowered = text.lower()
    # Typographic apostrophes would otherwise be stripped as punctuation while
    # straight ones are kept, making "don’t" and "don't" compare as different.
    lowered = lowered.replace("\u2019", "'").replace("\u2018", "'")
    without_punctuation = re.sub(r"[^\w\s']", "", lowered)
    return re.sub(r"\s+", " ", without_punctuation).strip()
|
|
|
|
|
|
def matching_stem(normalized_text: str) -> str | None:
    """Return the first entry of TEMPLATE_STEMS that *normalized_text* starts with.

    Args:
        normalized_text: Question text already passed through normalize_text.

    Returns:
        The matching stem, or None when the text starts with no known stem.
    """
    candidates = (stem for stem in TEMPLATE_STEMS if normalized_text.startswith(stem))
    return next(candidates, None)
|
|
|
|
|
|
def load_json_records(questions_dir: Path) -> list[QuestionRecord]:
    """Load every question that has visible text from the JSON files in a directory.

    Files matching ``*.json`` directly inside *questions_dir* are read in
    sorted path order. Each file is expected to hold a JSON object whose
    optional ``"questions"`` key is a list of objects; entries with a
    missing or empty ``"text"`` are skipped.

    Args:
        questions_dir: Directory containing the question JSON files.

    Returns:
        One QuestionRecord per question with text, in file order, with
        ``source`` set to the file's path relative to ROOT.

    Raises:
        KeyError: If a question with text has no ``"id"``.
        ValueError: If a file is not located under ROOT, or if a file
            contains invalid JSON (``json.JSONDecodeError``).
    """
    records: list[QuestionRecord] = []
    for path in sorted(questions_dir.glob("*.json")):
        # Read explicitly as UTF-8: the platform default encoding would
        # mis-decode non-ASCII question text on non-UTF-8 locales.
        with path.open(encoding="utf-8") as f:
            data = json.load(f)
        for q in data.get("questions", []):
            text = q.get("text", "")
            if not text:
                continue
            records.append(QuestionRecord(q["id"], text, str(path.relative_to(ROOT))))
    return records
|
|
|
|
|
|
def load_db_records(db_path: Path) -> list[QuestionRecord]:
    """Load the active questions with non-empty text from the SQLite database.

    Args:
        db_path: Path of the SQLite database file.

    Returns:
        One QuestionRecord per matching row of the ``question`` table,
        ordered by id, with ``source`` set to *db_path* relative to ROOT.
        An empty list when *db_path* does not exist.
    """
    if db_path.exists():
        connection = sqlite3.connect(db_path)
        try:
            cursor = connection.execute(
                "SELECT id, text FROM question WHERE status='active' AND text<>'' ORDER BY id"
            )
            fetched = cursor.fetchall()
        finally:
            # Close the connection whether or not the query succeeded.
            connection.close()

        loaded = []
        for qid, text in fetched:
            loaded.append(QuestionRecord(qid, text, str(db_path.relative_to(ROOT))))
        return loaded

    # A missing database is not an error; there is simply nothing to load.
    return []
|
|
|
|
|
|
def validate_records(records: list[QuestionRecord], label: str) -> list[str]:
    """Run all variety checks on *records* and return the problems found.

    Three checks are performed, and their messages appear in this order:

    1. Questions whose normalized text is identical (at most the ten largest
       groups are listed, each with up to six ids).
    2. Template stems used by more than MAX_TEMPLATE_STEM_ROWS questions
       (with up to eight sample ids).
    3. Questions whose lowercased text matches one of
       SUSPECT_GRAMMAR_PATTERNS (with up to eight ids per pattern).

    Args:
        records: Questions to validate.
        label: Prefix identifying the data set in summary messages.

    Returns:
        Human-readable message lines; empty when every check passes.
    """
    grouped_by_text: dict[str, list[QuestionRecord]] = defaultdict(list)
    grouped_by_stem: dict[str, list[QuestionRecord]] = defaultdict(list)
    for rec in records:
        key = normalize_text(rec.text)
        grouped_by_text[key].append(rec)
        stem_hit = matching_stem(key)
        if stem_hit:
            grouped_by_stem[stem_hit].append(rec)

    problems: list[str] = []

    # Check 1: duplicates, largest groups first, ties broken by text.
    repeated = sorted(
        ((key, group) for key, group in grouped_by_text.items() if len(group) > 1),
        key=lambda pair: (-len(pair[1]), pair[0]),
    )
    if repeated:
        # Every record beyond the first in a group counts as one extra copy.
        surplus = sum(len(group) - 1 for _, group in repeated)
        problems.append(f"{label}: {surplus} duplicate visible question texts")
        for _, group in repeated[:10]:
            shown = [rec.id for rec in group[:6]]
            if len(group) > 6:
                shown.append("...")
            ids = ", ".join(shown)
            problems.append(f" {len(group)}x {group[0].text!r} ({ids})")

    # Check 2: overused template stems, in alphabetical stem order.
    for stem in sorted(grouped_by_stem):
        group = grouped_by_stem[stem]
        if len(group) <= MAX_TEMPLATE_STEM_ROWS:
            continue
        problems.append(
            f"{label}: stem {stem!r} appears {len(group)} times "
            f"(max {MAX_TEMPLATE_STEM_ROWS})"
        )
        sample_ids = ", ".join(rec.id for rec in group[:8])
        problems.append(f" sample ids: {sample_ids}")

    # Check 3: suspect grammar, matched against the lowercased raw text.
    lowered_texts = [(rec, rec.text.lower()) for rec in records]
    for name, pattern in SUSPECT_GRAMMAR_PATTERNS:
        flagged = [rec for rec, lowered in lowered_texts if pattern.search(lowered)]
        if not flagged:
            continue
        shown = [rec.id for rec in flagged[:8]]
        if len(flagged) > 8:
            shown.append("...")
        ids = ", ".join(shown)
        problems.append(f"{label}: {len(flagged)} suspect {name} rows ({ids})")

    return problems
|
|
|
|
|
|
def main() -> int:
    """Validate the JSON question files and, unless disabled, the asset database.

    Command line:
        --json-only  Skip validating the asset DB.

    The database is also skipped when it yields no records (load_db_records
    returns an empty list for a missing file).

    Returns:
        Process exit status: 0 when all checks pass, 1 otherwise.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--json-only", action="store_true", help="Skip validating the asset DB")
    options = parser.parse_args()

    datasets = [("JSON", load_json_records(QUESTIONS_DIR))]
    if not options.json_only:
        db_records = load_db_records(DB_PATH)
        if db_records:
            datasets.append(("DB", db_records))

    problems: list[str] = [
        message
        for label, records in datasets
        for message in validate_records(records, label)
    ]

    if not problems:
        print("Question variety check passed")
        return 0

    print("Question variety check failed:")
    print("\n".join(problems))
    return 1
|
|
|
|
|
|
# Script entry point: exit with the status returned by main().
if __name__ == "__main__":
    raise SystemExit(main())
|