#!/usr/bin/env python3
"""
Validate JSON data files using Pydantic models.

This script validates all data files in kjvstudy_org/data/ using Pydantic models
for type safety and validation. Pydantic provides better error messages and
integrates naturally with FastAPI.

Usage:
    python scripts/validate_data.py                              # Validate all files
    python scripts/validate_data.py --file bible_metadata.json   # Validate specific file
    python scripts/validate_data.py --verbose                    # Show detailed output
    python scripts/validate_data.py --generate-schemas           # Generate JSON schemas
    python scripts/validate_data.py --books                      # Validate book introduction files

Requirements:
    pip install pydantic (already installed with FastAPI)
"""

import json
import re
import sys
from pathlib import Path
from typing import Dict, List, Tuple, Optional

try:
    from pydantic import BaseModel, RootModel, Field, field_validator, ValidationError
except ImportError:
    print("Error: pydantic package not found")
    print("Install with: pip install pydantic")
    sys.exit(1)

# Path to data directory
DATA_DIR = Path(__file__).parent.parent / "kjvstudy_org" / "data"
SCHEMAS_DIR = DATA_DIR / "schemas"

# Verse reference: book name (letters, digits, spaces), then "chapter:verse"
# with an optional "-verse" range end, e.g. "1 John 4:7-8".
_VERSE_REFERENCE_RE = re.compile(r'^[A-Za-z0-9 ]+ \d+:\d+(-\d+)?$')


def _check_verse_references(references: List[str]) -> List[str]:
    """Return *references* unchanged, or raise ValueError on the first bad one.

    ``fullmatch`` is used rather than ``match`` because ``$`` also matches just
    before a trailing newline, which would let "John 3:16\\n" slip through.
    """
    for reference in references:
        if not _VERSE_REFERENCE_RE.fullmatch(reference):
            raise ValueError(f"Invalid verse reference format: {reference}")
    return references


# ============================================================================
# Pydantic Models for Data Validation
# ============================================================================

class BibleMetadata(BaseModel):
    """Schema for bible_metadata.json"""
    # Exactly 39 Old Testament and 27 New Testament book names are required.
    old_testament_books: List[str] = Field(..., min_length=39, max_length=39)
    new_testament_books: List[str] = Field(..., min_length=27, max_length=27)
    book_abbreviations: Dict[str, str] = Field(..., min_length=1)

    @field_validator('old_testament_books', 'new_testament_books')
    @classmethod
    def check_unique_books(cls, v):
        """Reject a book list that contains the same name more than once."""
        if len(v) != len(set(v)):
            raise ValueError("Duplicate book names found")
        return v


class WordStudy(BaseModel):
    """Schema for individual word study entry"""
    # Every field is optional, but a field that is present must be non-empty.
    ot_term: Optional[str] = Field(None, min_length=1)
    ot_transliteration: Optional[str] = Field(None, min_length=1)
    ot_meaning: Optional[str] = Field(None, min_length=1)
    ot_note: Optional[str] = Field(None, min_length=1)
    nt_term: Optional[str] = Field(None, min_length=1)
    nt_transliteration: Optional[str] = Field(None, min_length=1)
    nt_meaning: Optional[str] = Field(None, min_length=1)
    nt_note: Optional[str] = Field(None, min_length=1)


class WordStudies(RootModel[Dict[str, WordStudy]]):
    """Schema for word_studies.json"""
    # The file's top-level object maps a key to its WordStudy entry.
    root: Dict[str, WordStudy]


class CatalogEntry(BaseModel):
    """Schema for study guide catalog entry"""
    title: str = Field(..., min_length=1)
    description: str = Field(..., min_length=1)
    # Slugs are lowercase letters and hyphens only.
    slug: str = Field(..., pattern=r'^[a-z-]+$')
    verses: List[str]

    @field_validator('verses')
    @classmethod
    def check_verse_format(cls, v):
        """Require every entry to look like "Book C:V" or "Book C:V-V"."""
        return _check_verse_references(v)


class StudySection(BaseModel):
    """Schema for study guide section"""
    title: str = Field(..., min_length=1)
    verses: List[str]
    content: str = Field(..., min_length=1)

    @field_validator('verses')
    @classmethod
    def check_verse_format(cls, v):
        """Require every entry to look like "Book C:V" or "Book C:V-V"."""
        return _check_verse_references(v)


class GuideContent(BaseModel):
    """Schema for study guide content"""
    title: str = Field(..., min_length=1)
    description: str = Field(..., min_length=1)
    # A guide must contain at least one section.
    sections: List[StudySection] = Field(..., min_length=1)


class StudyGuideFile(BaseModel):
    """Schema for a single study guide file"""
    content: GuideContent
    catalog_entry: Optional[CatalogEntry] = None
    category: Optional[str] = None


class TopicsFile(BaseModel):
    """Schema for a single topics file"""
    # The topics validator passes the whole parsed file in as ``root``.
    root: Dict[str, dict]


class ReadingPlanFile(BaseModel):
    """Schema for a single reading plan file"""
    # The reading plan validator passes the whole parsed file in as ``plan``.
    plan: Dict[str, List[Dict[str, object]]]


class VerseCommentaryEntry(BaseModel):
    """Schema for verse commentary entry"""
    analysis: str = Field(..., min_length=1)
    historical: str = Field(..., min_length=1)
    questions: List[str] = Field(..., min_length=1)


class VerseCommentaryBook(BaseModel):
    """Schema for a single verse commentary book file"""
    book: str = Field(..., min_length=1)
    # Maps chapter key -> verse key -> commentary entry.
    commentary: Dict[str, Dict[str, VerseCommentaryEntry]] = Field(..., min_length=1)

    @field_validator('commentary')
    @classmethod
    def check_numeric_keys(cls, v):
        """Require digit-only chapter/verse keys and non-empty chapters.

        This validator runs after Pydantic's own type validation, so the
        nested values are already dicts of VerseCommentaryEntry; only the
        keys and emptiness need checking here.
        """
        for chapter_key, verses in v.items():
            if not str(chapter_key).isdigit():
                raise ValueError(f"Invalid chapter key: {chapter_key}")
            if not verses:
                raise ValueError(f"Chapter {chapter_key} must contain verse entries")
            for verse_key in verses:
                if not str(verse_key).isdigit():
                    raise ValueError(f"Invalid verse key: {verse_key}")
        return v


class Devotional(BaseModel):
    """Schema for verse devotional content"""
    title: str = Field(..., min_length=1, max_length=100)
    theme: str = Field(..., min_length=1, max_length=50)
    opening: str = Field(..., min_length=10)
    meditation: str = Field(..., min_length=20)
    application: str = Field(..., min_length=10)
    prayer: str = Field(..., min_length=10)

    @field_validator('prayer')
    @classmethod
    def prayer_ends_with_amen(cls, v):
        """Require the prayer (ignoring surrounding whitespace) to end with 'Amen.'"""
        if not v.strip().endswith('Amen.'):
            raise ValueError("Prayer must end with 'Amen.'")
        return v


class FeaturedVerse(BaseModel):
    """Schema for individual featured verse with optional devotional"""
    book: str = Field(..., min_length=1)
    chapter: int = Field(..., ge=1)
    verse: int = Field(..., ge=1)
    devotional: Optional[Devotional] = None


class FeaturedVerses(BaseModel):
    """Schema for featured_verses.json - 365 verses with devotionals"""
    verses: List[FeaturedVerse] = Field(..., min_length=1)

    @field_validator('verses')
    @classmethod
    def check_devotional_coverage(cls, v):
        """Accept the list as-is.

        Devotionals are optional per verse, so partial coverage is
        informational only and deliberately not treated as an error.
        """
        return v


class RedLetterVerses(BaseModel):
    """Schema for red_letter_verses.json"""
    description: str = Field(..., min_length=1)
    note: str = Field(..., min_length=1)
    verses: Dict[str, str] = Field(..., min_length=1)

    @field_validator('verses')
    @classmethod
    def check_verses(cls, v):
        """Require "Book C:V" keys and non-empty values ('full' or a text)."""
        # Local import: the regex module is only needed by this validator.
        import re
        # Validate verse reference format
        pattern = r"^[A-Za-z0-9 ']+ \d+:\d+$"
        for key, value in v.items():
            # fullmatch, not match: ``$`` alone would also accept a trailing newline.
            if not re.fullmatch(pattern, key):
                raise ValueError(f"Invalid verse reference key: {key}")
            # Values are already validated as str; "full" or any other
            # non-empty string is accepted, so only the empty string fails.
            if not value:
                raise ValueError(f"Invalid value for {key}: must be 'full' or a non-empty string")
        return v


class ResourceSlugs(BaseModel):
    """Schema for resource_slugs.json"""
    study_guides: List[str]
    angels: List[str]
    prophets: List[str]
    names_of_god: List[str]
    parables: List[str]
    covenants: List[str]
    apostles: List[str]
    women: List[str]
    festivals: List[str]
    fruits_of_spirit: List[str]

    @field_validator('*')
    @classmethod
    def check_slugs(cls, v):
        """Require each slug list to be duplicate-free, lowercase-and-hyphen only."""
        # Local import: the regex module is only needed by this validator.
        import re
        # Check for duplicates
        if len(v) != len(set(v)):
            raise ValueError("Duplicate slugs found")
        # Check slug format
        pattern = r'^[a-z-]+$'
        for slug in v:
            # fullmatch, not match: ``$`` alone would also accept a trailing newline.
            if not re.fullmatch(pattern, slug):
                raise ValueError(f"Invalid slug format: {slug}")
        return v


class PoetryBookData(BaseModel):
    """Schema for individual book poetry data"""
    is_poetry: bool = Field(..., description="Whether the entire book is poetry")
    poetry_chapters: List[int] | str = Field(..., description="List of chapter numbers that are poetry, or 'all'")
    stanza_breaks: Dict[str, List[int]] = Field(..., description="Map of chapter number to list of verse numbers with stanza breaks")

    @field_validator('poetry_chapters')
    @classmethod
    def check_chapters_sorted(cls, v):
        """Accept the string 'all', or a sorted, duplicate-free chapter list."""
        if isinstance(v, str):
            # "all" is the only string allowed; without this guard any other
            # string would be compared against sorted(v) and produce a
            # misleading "must be sorted" message.
            if v != "all":
                raise ValueError("poetry_chapters must be a list of chapter numbers or 'all'")
            return v
        if v != sorted(v):
            raise ValueError("poetry_chapters must be sorted")
        if len(v) != len(set(v)):
            raise ValueError("Duplicate chapter numbers found")
        return v

    @field_validator('stanza_breaks')
    @classmethod
    def check_stanza_breaks(cls, v):
        """Require digit-only chapter keys and sorted, duplicate-free verse lists."""
        for chapter_key, verses in v.items():
            if not chapter_key.isdigit():
                raise ValueError(f"Invalid chapter key: {chapter_key}")
            if verses != sorted(verses):
                raise ValueError(f"Stanza breaks for chapter {chapter_key} must be sorted")
            if len(verses) != len(set(verses)):
                raise ValueError(f"Duplicate verse numbers in chapter {chapter_key}")
        return v


# Book names accepted as keys in poetry_formatting.json. Built once at import
# time instead of on every validation call.
_VALID_BOOK_NAMES = frozenset({
    'Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy',
    'Joshua', 'Judges', 'Ruth', '1 Samuel', '2 Samuel', '1 Kings', '2 Kings',
    '1 Chronicles', '2 Chronicles', 'Ezra', 'Nehemiah', 'Esther',
    'Job', 'Psalms', 'Proverbs', 'Ecclesiastes', 'Song of Solomon',
    'Isaiah', 'Jeremiah', 'Lamentations', 'Ezekiel', 'Daniel',
    'Hosea', 'Joel', 'Amos', 'Obadiah', 'Jonah', 'Micah', 'Nahum',
    'Habakkuk', 'Zephaniah', 'Haggai', 'Zechariah', 'Malachi',
    'Matthew', 'Mark', 'Luke', 'John', 'Acts',
    'Romans', '1 Corinthians', '2 Corinthians', 'Galatians', 'Ephesians',
    'Philippians', 'Colossians', '1 Thessalonians', '2 Thessalonians',
    '1 Timothy', '2 Timothy', 'Titus', 'Philemon', 'Hebrews',
    'James', '1 Peter', '2 Peter', '1 John', '2 John', '3 John', 'Jude', 'Revelation',
})


class PoetryFormatting(BaseModel):
    """Schema for poetry_formatting.json"""
    books: Dict[str, PoetryBookData] = Field(..., min_length=1)

    @field_validator('books')
    @classmethod
    def check_valid_books(cls, v):
        """Require every key to be one of the 66 recognised book names."""
        # Many books have poetic sections (Psalms, Prophets, NT hymns, etc.)
        # Just validate that book names are valid Bible books
        for book_name in v:
            if book_name not in _VALID_BOOK_NAMES:
                raise ValueError(f"Invalid book name: {book_name}")
        return v


class OutlineSection(BaseModel):
    """Schema for book outline section"""
    section: str = Field(..., min_length=1)
    chapters: str = Field(..., min_length=1)
    description: str = Field(..., min_length=1)


class KeyTheme(BaseModel):
    """Schema for book key theme"""
    theme: str = Field(..., min_length=1)
    description: str = Field(..., min_length=1)


class KeyVerse(BaseModel):
    """Schema for book key verse"""
    reference: str = Field(..., min_length=1)
    text: str = Field(..., min_length=1)


class BookIntroduction(BaseModel):
    """Schema for individual book introduction file"""
    name: str = Field(..., min_length=1)
    abbreviation: str = Field(..., min_length=1)
    testament: str = Field(..., pattern=r'^(Old Testament|New Testament)$')
    position: int = Field(..., ge=1, le=66)
    chapters: int = Field(..., ge=1)
    category: str = Field(..., min_length=1)
    author: str = Field(..., min_length=1)
    date_written: str = Field(..., min_length=1)
    introduction: str = Field(..., min_length=1)
    outline: List[OutlineSection] = Field(..., min_length=1)
    key_themes: List[KeyTheme] = Field(..., min_length=1)
    key_verses: List[KeyVerse] = Field(..., min_length=1)
    christ_in_book: Optional[str] = None


# ============================================================================
# Validation Logic
# ============================================================================

# Mapping of data files to their Pydantic models. Keys without a ".json"
# suffix name directories of per-item files rather than single files.
MODEL_MAPPING = {
    "bible_metadata.json": BibleMetadata,
    "word_studies.json": WordStudies,
    "study_guides": StudyGuideFile,
    "verse_commentary": VerseCommentaryBook,
    "topics": TopicsFile,
    "reading_plans": ReadingPlanFile,
    "featured_verses.json": FeaturedVerses,
    "red_letter_verses.json": RedLetterVerses,
    "resource_slugs.json": ResourceSlugs,
    "poetry_formatting.json": PoetryFormatting,
}


def load_json(file_path: Path) -> Tuple[Optional[dict], Optional[str]]:
    """Load a JSON file.

    Returns:
        ``(data, None)`` on success, or ``(None, message)`` when the file
        cannot be read or parsed. ``data`` is whatever the file's top-level
        JSON value is; callers validate its shape.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f), None
    except json.JSONDecodeError as e:
        return None, f"JSON syntax error: {e}"
    except Exception as e:
        # Reporting boundary of a CLI tool: any other failure (I/O, decoding)
        # is turned into a per-file error message instead of a traceback.
        return None, f"Error loading file: {e}"


def _print_validation_errors(error: ValidationError, verbose: bool = False) -> None:
    """Print one line per Pydantic error, plus its context when *verbose*."""
    for detail in error.errors():
        location = " -> ".join(str(loc) for loc in detail['loc'])
        print(f" {location}: {detail['msg']}")
        if verbose and 'ctx' in detail:
            print(f" Context: {detail['ctx']}")


def _validate_json_directory(dir_name: str, validate, verbose: bool = False) -> bool:
    """Validate every ``*.json`` file directly inside ``DATA_DIR / dir_name``.

    Args:
        dir_name: Directory name under DATA_DIR; also used as the report label.
        validate: Callable taking the parsed JSON value and raising
            (ValidationError or any other exception) when it is invalid.
        verbose: Also print a line for each file that passes.

    Returns:
        True when the directory exists and no file failed.
    """
    dir_path = DATA_DIR / dir_name
    if not dir_path.exists():
        print(f"❌ {dir_name}: Directory not found at {dir_path}")
        return False

    passed = 0
    failed = 0

    for file_path in sorted(dir_path.glob("*.json")):
        data, error = load_json(file_path)
        if error:
            print(f"❌ {file_path.name}: {error}")
            failed += 1
            continue

        try:
            validate(data)
        except ValidationError as e:
            print(f"❌ {file_path.name}: Validation failed")
            _print_validation_errors(e, verbose)
            failed += 1
        except Exception as e:
            print(f"❌ {file_path.name}: Unexpected error")
            print(f" Error: {e}")
            failed += 1
        else:
            if verbose:
                print(f"✅ {file_path.name}: Valid")
            passed += 1

    if failed:
        print(f"❌ {dir_name}: {failed} files failed validation")
        return False

    print(f"✅ {dir_name}: Valid ({passed} files)")
    return True


def _validate_reading_plan_data(data) -> None:
    """Check one parsed reading plan file; raise when it is invalid."""
    # Each file has one key with the plan id mapping to the plan object
    if len(data) != 1:
        raise ValueError("Reading plan file must contain exactly one plan")
    ReadingPlanFile(plan=data)


def validate_verse_commentary_directory(verbose: bool = False) -> bool:
    """Validate all per-book verse commentary files."""
    return _validate_json_directory(
        "verse_commentary", VerseCommentaryBook.model_validate, verbose
    )


def validate_study_guides_directory(verbose: bool = False) -> bool:
    """Validate per-guide study guide files."""
    return _validate_json_directory(
        "study_guides", StudyGuideFile.model_validate, verbose
    )


def validate_topics_directory(verbose: bool = False) -> bool:
    """Validate per-topic files."""
    return _validate_json_directory(
        "topics", lambda data: TopicsFile(root=data), verbose
    )


def validate_reading_plans_directory(verbose: bool = False) -> bool:
    """Validate per-plan reading plan files."""
    return _validate_json_directory(
        "reading_plans", _validate_reading_plan_data, verbose
    )


# MODEL_MAPPING entries that are directories, and the function that walks each.
_DIRECTORY_VALIDATORS = {
    "verse_commentary": validate_verse_commentary_directory,
    "study_guides": validate_study_guides_directory,
    "topics": validate_topics_directory,
    "reading_plans": validate_reading_plans_directory,
}


def validate_file(data_file: str, verbose: bool = False) -> bool:
    """Validate a single data file (or data directory) using its Pydantic model.

    Args:
        data_file: A key of MODEL_MAPPING. Unknown names are skipped and
            count as success.
        verbose: Print extra detail (model name, file size, error context).

    Returns:
        True when the target is valid or has no model; False otherwise.
    """
    directory_validator = _DIRECTORY_VALIDATORS.get(data_file)
    if directory_validator is not None:
        return directory_validator(verbose)

    if data_file not in MODEL_MAPPING:
        if verbose:
            print(f"⚠️ {data_file}: No validation model defined (skipped)")
        return True

    model_class = MODEL_MAPPING[data_file]
    data_path = DATA_DIR / data_file

    # Check if file exists
    if not data_path.exists():
        print(f"❌ {data_file}: File not found at {data_path}")
        return False

    # Load data file
    data, error = load_json(data_path)
    if error:
        print(f"❌ {data_file}: {error}")
        return False

    # Validate using Pydantic model. model_validate handles both RootModel and
    # regular BaseModel subclasses, and reports a non-object top-level value
    # (list, null) as a ValidationError instead of a bare TypeError.
    try:
        model_class.model_validate(data)
    except ValidationError as e:
        print(f"❌ {data_file}: Validation failed")
        _print_validation_errors(e, verbose)
        return False
    except Exception as e:
        print(f"❌ {data_file}: Unexpected error")
        print(f" Error: {e}")
        return False

    print(f"✅ {data_file}: Valid")
    if verbose:
        print(f" Model: {model_class.__name__}")
        print(f" Size: {data_path.stat().st_size:,} bytes")
    return True


def validate_all(verbose: bool = False) -> Tuple[int, int]:
    """Validate all data files with models. Returns (passed, failed) counts."""
    passed = 0
    failed = 0

    print("=" * 60)
    print("Validating JSON data files with Pydantic models")
    print("=" * 60)
    print()

    for data_file in sorted(MODEL_MAPPING):
        if validate_file(data_file, verbose):
            passed += 1
        else:
            failed += 1
        if verbose:
            print()

    return passed, failed


def validate_book_file(book_file: Path, verbose: bool = False) -> bool:
    """Validate a single book JSON file using BookIntroduction model."""
    # Load data file
    data, error = load_json(book_file)
    if error:
        print(f"❌ {book_file.name}: {error}")
        return False

    # Validate using Pydantic model
    try:
        BookIntroduction.model_validate(data)
    except ValidationError as e:
        print(f"❌ {book_file.name}: Validation failed")
        _print_validation_errors(e, verbose)
        return False
    except Exception as e:
        print(f"❌ {book_file.name}: Unexpected error")
        print(f" Error: {e}")
        return False

    print(f"✅ {book_file.name}: Valid")
    if verbose:
        print(f" Size: {book_file.stat().st_size:,} bytes")
    return True


def validate_all_books(verbose: bool = False) -> Tuple[int, int]:
    """Validate all 66 book introduction files. Returns (passed, failed) counts.

    A missing books directory is reported as one failure, so a caller that
    derives its exit status from the failed count does not report success.
    """
    passed = 0
    failed = 0

    books_dir = DATA_DIR / "books"
    if not books_dir.exists():
        print(f"❌ Books directory not found: {books_dir}")
        return 0, 1

    print("=" * 60)
    print("Validating 66 book introduction files")
    print("=" * 60)
    print()

    for book_file in sorted(books_dir.glob("*.json")):
        if validate_book_file(book_file, verbose):
            passed += 1
        else:
            failed += 1
        if verbose:
            print()

    return passed, failed


def _write_schema(model_class, schema_file: str, title: str) -> bool:
    """Write *model_class*'s JSON Schema to SCHEMAS_DIR; return True on success."""
    schema_path = SCHEMAS_DIR / schema_file
    try:
        # Generate JSON Schema from Pydantic model
        schema = model_class.model_json_schema()

        # Add metadata
        schema['$id'] = f"https://kjvstudy.org/schemas/{schema_file}"
        schema['title'] = title

        # Write schema file
        with open(schema_path, 'w', encoding='utf-8') as f:
            json.dump(schema, f, indent=2, ensure_ascii=False)
    except Exception as e:
        # Keep going with the remaining schemas; the failure is reported
        # through the return value.
        print(f"❌ Failed to generate {schema_file}: {e}")
        return False

    print(f"✅ Generated {schema_file}")
    return True


def generate_json_schemas():
    """Generate JSON Schema files from Pydantic models.

    Returns:
        True when every schema was written, False if any failed.
    """
    print("=" * 60)
    print("Generating JSON Schema files from Pydantic models")
    print("=" * 60)
    print()

    # parents=True so a fresh checkout without the data directory tree still works.
    SCHEMAS_DIR.mkdir(parents=True, exist_ok=True)

    all_written = True

    # Generate schemas for main data files
    for data_file, model_class in MODEL_MAPPING.items():
        # "x.json" -> "x.schema.json"; directory keys -> "<name>.schema.json"
        schema_file = f"{data_file.removesuffix('.json')}.schema.json"
        title = model_class.__doc__ or model_class.__name__
        if not _write_schema(model_class, schema_file, title):
            all_written = False

    # Generate schema for book introduction files
    if not _write_schema(
        BookIntroduction,
        "book_introduction.schema.json",
        "Schema for individual book introduction files",
    ):
        all_written = False

    print()
    print(f"Schemas written to: {SCHEMAS_DIR}")
    return all_written


def main():
    """Main entry point: parse arguments, run the requested action, and exit.

    The process exits with status 0 when everything succeeded and 1 otherwise.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Validate JSON data files with Pydantic models",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python scripts/validate_data.py                          # Validate all files
  python scripts/validate_data.py -f bible_metadata.json   # Validate one file
  python scripts/validate_data.py --verbose                # Show details
  python scripts/validate_data.py --generate-schemas       # Generate JSON schemas
        """
    )
    parser.add_argument(
        '-f', '--file',
        help='Validate specific file only',
        metavar='FILE'
    )
    parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        help='Show detailed output'
    )
    parser.add_argument(
        '--generate-schemas',
        action='store_true',
        help='Generate JSON Schema files from Pydantic models'
    )
    parser.add_argument(
        '--books',
        action='store_true',
        help='Validate all 66 book introduction files'
    )

    args = parser.parse_args()

    # Generate schemas if requested
    if args.generate_schemas:
        sys.exit(0 if generate_json_schemas() else 1)

    # Validate a single named file
    if args.file and not args.books:
        success = validate_file(args.file, args.verbose)
        sys.exit(0 if success else 1)

    # Validate books if requested, otherwise all mapped data files
    if args.books:
        passed, failed = validate_all_books(args.verbose)
    else:
        passed, failed = validate_all(args.verbose)

    print()
    print("=" * 60)
    print(f"Results: {passed} passed, {failed} failed")
    print("=" * 60)

    sys.exit(0 if failed == 0 else 1)


if __name__ == "__main__":
    main()