kjvstudy.org/scripts/validate_data.py

#!/usr/bin/env python3
"""
Validate JSON data files using Pydantic models.

This script validates all data files in kjvstudy_org/data/ using Pydantic models
for type safety and validation. Pydantic provides better error messages and
integrates naturally with FastAPI.

Usage:
    python scripts/validate_data.py              # Validate all files
    python scripts/validate_data.py --file bible_metadata.json  # Validate specific file
    python scripts/validate_data.py --verbose    # Show detailed output
    python scripts/validate_data.py --generate-schemas  # Generate JSON schemas
    python scripts/validate_data.py --books      # Validate book introduction files

Requirements:
    pip install pydantic (already installed with FastAPI)
"""

import json
import sys
from pathlib import Path
from typing import Dict, List, Tuple, Optional

try:
    from pydantic import BaseModel, RootModel, Field, field_validator, ValidationError
except ImportError:
    print("Error: pydantic package not found")
    print("Install with: pip install pydantic")
    sys.exit(1)

# Path to data directory: the "kjvstudy_org/data" folder located two directory
# levels above this script file.
DATA_DIR = Path(__file__).parent.parent / "kjvstudy_org" / "data"
# Directory that generate_json_schemas() writes the *.schema.json files into.
SCHEMAS_DIR = DATA_DIR / "schemas"


# ============================================================================
# Pydantic Models for Data Validation
# ============================================================================

class BibleMetadata(BaseModel):
    """Schema for bible_metadata.json"""
    # NOTE: the class docstring above is emitted into the generated JSON schema,
    # so it is kept verbatim; explanatory notes live in comments instead.

    # Exactly 39 entries are required.
    old_testament_books: List[str] = Field(min_length=39, max_length=39)
    # Exactly 27 entries are required.
    new_testament_books: List[str] = Field(min_length=27, max_length=27)
    # Must contain at least one entry.
    book_abbreviations: Dict[str, str] = Field(min_length=1)

    @field_validator('old_testament_books', 'new_testament_books')
    @classmethod
    def check_unique_books(cls, books):
        """Return the list unchanged, or raise ValueError if a name repeats."""
        distinct_names = set(books)
        if len(distinct_names) == len(books):
            return books
        raise ValueError("Duplicate book names found")


class WordStudy(BaseModel):
    """Schema for individual word study entry"""
    # Every field may be omitted (defaults to None); a value that is present
    # must be a string of at least one character.

    # Fields prefixed "ot_".
    ot_term: Optional[str] = Field(default=None, min_length=1)
    ot_transliteration: Optional[str] = Field(default=None, min_length=1)
    ot_meaning: Optional[str] = Field(default=None, min_length=1)
    ot_note: Optional[str] = Field(default=None, min_length=1)

    # Fields prefixed "nt_".
    nt_term: Optional[str] = Field(default=None, min_length=1)
    nt_transliteration: Optional[str] = Field(default=None, min_length=1)
    nt_meaning: Optional[str] = Field(default=None, min_length=1)
    nt_note: Optional[str] = Field(default=None, min_length=1)


class WordStudies(RootModel[Dict[str, WordStudy]]):
    """Schema for word_studies.json"""
    # Root model: the whole document is one JSON object whose values are
    # WordStudy entries. The class docstring is used as the schema title by
    # generate_json_schemas(), so it is intentionally left as-is.
    root: Dict[str, WordStudy]


class CatalogEntry(BaseModel):
    """Schema for study guide catalog entry"""
    # NOTE: the class docstring is emitted into the generated JSON schema, so it
    # is kept verbatim.
    title: str = Field(..., min_length=1)
    description: str = Field(..., min_length=1)
    # Lowercase ASCII letters and hyphens only.
    slug: str = Field(..., pattern=r'^[a-z-]+$')
    verses: List[str]

    @field_validator('verses')
    @classmethod
    def check_verse_format(cls, v):
        """Check that every reference looks like "Book C:V" or "Book C:V-V".

        Returns the list unchanged.

        Raises:
            ValueError: for the first reference that does not match.
        """
        import re
        pattern = re.compile(r'^[A-Za-z0-9 ]+ \d+:\d+(-\d+)?$')
        for verse in v:
            # fullmatch rather than match: with re.match the trailing "$" also
            # matches just before a final newline, so "John 3:16\n" slipped
            # through the original check.
            if not pattern.fullmatch(verse):
                raise ValueError(f"Invalid verse reference format: {verse}")
        return v


class StudySection(BaseModel):
    """Schema for study guide section"""
    # NOTE: the class docstring is emitted into the generated JSON schema, so it
    # is kept verbatim.
    title: str = Field(..., min_length=1)
    verses: List[str]
    content: str = Field(..., min_length=1)

    @field_validator('verses')
    @classmethod
    def check_verse_format(cls, v):
        """Check that every reference looks like "Book C:V" or "Book C:V-V".

        Returns the list unchanged.

        Raises:
            ValueError: for the first reference that does not match.
        """
        import re
        pattern = re.compile(r'^[A-Za-z0-9 ]+ \d+:\d+(-\d+)?$')
        for verse in v:
            # fullmatch rather than match: with re.match the trailing "$" also
            # matches just before a final newline, so a reference ending in
            # "\n" was wrongly accepted.
            if not pattern.fullmatch(verse):
                raise ValueError(f"Invalid verse reference format: {verse}")
        return v


class GuideContent(BaseModel):
    """Schema for study guide content"""
    # All three fields are required; the strings must be non-empty and at
    # least one section must be present.
    title: str = Field(min_length=1)
    description: str = Field(min_length=1)
    sections: List[StudySection] = Field(min_length=1)


class StudyGuideFile(BaseModel):
    """Schema for a single study guide file"""
    # Only ``content`` is required; the remaining keys default to None.
    content: GuideContent
    catalog_entry: Optional[CatalogEntry] = Field(default=None)
    category: Optional[str] = Field(default=None)


class TopicsFile(BaseModel):
    """Schema for a single topics file"""
    # A regular BaseModel with a single field named ``root`` (not a pydantic
    # RootModel): validate_topics_directory() wraps the parsed file content as
    # ``root=...`` before validating. Each value only has to be a JSON object.
    # NOTE(review): the JSON schema generated from this model therefore
    # describes an object with a "root" property rather than the file's own
    # top-level shape — confirm that is intended.
    root: Dict[str, dict]


class ReadingPlanFile(BaseModel):
    """Schema for a single reading plan file"""
    # validate_reading_plans_directory() passes the whole parsed file in as
    # ``plan=...``; every top-level value must be a list of JSON objects.
    # NOTE(review): as with any wrapper field, the generated JSON schema
    # describes an object with a "plan" property — confirm that is intended.
    plan: Dict[str, List[Dict[str, object]]]


class VerseCommentaryEntry(BaseModel):
    """Schema for verse commentary entry"""
    # All fields are required: two non-empty strings and a list holding at
    # least one question.
    analysis: str = Field(min_length=1)
    historical: str = Field(min_length=1)
    questions: List[str] = Field(min_length=1)


class VerseCommentaryBook(BaseModel):
    """Schema for a single verse commentary book file"""
    book: str = Field(min_length=1)
    # Two-level mapping: chapter key -> verse key -> commentary entry.
    commentary: Dict[str, Dict[str, VerseCommentaryEntry]] = Field(min_length=1)

    @field_validator('commentary')
    @classmethod
    def check_numeric_keys(cls, v):
        """Require digit-only chapter/verse keys and non-empty chapters.

        Returns the mapping unchanged; raises ValueError on the first problem.
        """
        for chapter, verse_map in v.items():
            if not str(chapter).isdigit():
                raise ValueError(f"Invalid chapter key: {chapter}")
            if not (isinstance(verse_map, dict) and len(verse_map) > 0):
                raise ValueError(f"Chapter {chapter} must contain verse entries")
            for verse, entry in verse_map.items():
                if not str(verse).isdigit():
                    raise ValueError(f"Invalid verse key: {verse}")
                if isinstance(entry, (dict, BaseModel)):
                    continue
                raise ValueError(f"Verse {chapter}:{verse} must be an object")
        return v


class FeaturedVerse(BaseModel):
    """Schema for individual featured verse"""
    # Non-empty book name plus chapter and verse numbers, each at least 1.
    book: str = Field(min_length=1)
    chapter: int = Field(ge=1)
    verse: int = Field(ge=1)


class FeaturedVerses(BaseModel):
    """Schema for featured_verses.json"""
    # At least one featured verse must be listed.
    verses: List[FeaturedVerse] = Field(min_length=1)


class RedLetterVerses(BaseModel):
    """Schema for red_letter_verses.json"""
    # NOTE: the class docstring is used as the schema title by
    # generate_json_schemas(), so it is kept verbatim.
    description: str = Field(..., min_length=1)
    note: str = Field(..., min_length=1)
    # Maps a verse reference ("Book C:V") to "full" or another non-empty string.
    verses: Dict[str, str] = Field(..., min_length=1)

    @field_validator('verses')
    @classmethod
    def check_verses(cls, v):
        """Validate reference keys and reject empty values.

        Returns the mapping unchanged.

        Raises:
            ValueError: for the first malformed key or empty value.
        """
        import re
        # Validate verse reference format
        pattern = re.compile(r"^[A-Za-z0-9 ']+ \d+:\d+$")
        for key, value in v.items():
            # fullmatch rather than match: with re.match the trailing "$" also
            # matches just before a final newline, so a key ending in "\n"
            # was wrongly accepted.
            if not pattern.fullmatch(key):
                raise ValueError(f"Invalid verse reference key: {key}")
            # Value must be either "full" or a non-empty string; "full" is
            # itself a non-empty string, so one check covers both cases.
            if not isinstance(value, str) or not value:
                raise ValueError(f"Invalid value for {key}: must be 'full' or a non-empty string")
        return v


class ResourceSlugs(BaseModel):
    """Schema for resource_slugs.json"""
    # NOTE: the class docstring is used as the schema title by
    # generate_json_schemas(), so it is kept verbatim.
    # One required list of slugs per resource category.
    study_guides: List[str]
    angels: List[str]
    prophets: List[str]
    names_of_god: List[str]
    parables: List[str]
    covenants: List[str]
    apostles: List[str]
    women: List[str]
    festivals: List[str]
    fruits_of_spirit: List[str]

    @field_validator('*')
    @classmethod
    def check_slugs(cls, v):
        """Applied to every field: slugs must be unique and well-formed.

        Returns the list unchanged.

        Raises:
            ValueError: if the list has duplicates or a slug contains anything
                other than lowercase ASCII letters and hyphens.
        """
        import re
        # Check for duplicates
        if len(v) != len(set(v)):
            raise ValueError("Duplicate slugs found")
        # Check slug format
        pattern = re.compile(r'^[a-z-]+$')
        for slug in v:
            # fullmatch rather than match: with re.match the trailing "$" also
            # matches just before a final newline, so "angels\n" was accepted.
            if not pattern.fullmatch(slug):
                raise ValueError(f"Invalid slug format: {slug}")
        return v


class OutlineSection(BaseModel):
    """Schema for book outline section"""
    # Three required, non-empty strings. Note that ``chapters`` is a string,
    # not a number.
    section: str = Field(min_length=1)
    chapters: str = Field(min_length=1)
    description: str = Field(min_length=1)


class KeyTheme(BaseModel):
    """Schema for book key theme"""
    # Both strings are required and must be non-empty.
    theme: str = Field(min_length=1)
    description: str = Field(min_length=1)


class KeyVerse(BaseModel):
    """Schema for book key verse"""
    # Both strings are required and must be non-empty.
    reference: str = Field(min_length=1)
    text: str = Field(min_length=1)


class BookIntroduction(BaseModel):
    """Schema for individual book introduction file"""
    # Identification.
    name: str = Field(min_length=1)
    abbreviation: str = Field(min_length=1)
    # Must be exactly "Old Testament" or "New Testament".
    testament: str = Field(pattern=r'^(Old Testament|New Testament)$')
    # Integer in the range 1..66.
    position: int = Field(ge=1, le=66)
    # Integer, at least 1.
    chapters: int = Field(ge=1)

    # Descriptive text; every string must be non-empty.
    category: str = Field(min_length=1)
    author: str = Field(min_length=1)
    date_written: str = Field(min_length=1)
    introduction: str = Field(min_length=1)

    # Structured sections; each list needs at least one item.
    outline: List[OutlineSection] = Field(min_length=1)
    key_themes: List[KeyTheme] = Field(min_length=1)
    key_verses: List[KeyVerse] = Field(min_length=1)

    # The only optional field.
    christ_in_book: Optional[str] = Field(default=None)


# ============================================================================
# Validation Logic
# ============================================================================

# Mapping of data files to their Pydantic models.
# Keys ending in ".json" name a single file inside DATA_DIR. The keys without
# an extension ("study_guides", "verse_commentary", "topics", "reading_plans")
# name sub-directories, which validate_file() hands off to the dedicated
# validate_*_directory() functions. validate_all() iterates over these keys and
# generate_json_schemas() emits one schema per entry.
MODEL_MAPPING = {
    "bible_metadata.json": BibleMetadata,
    "word_studies.json": WordStudies,
    "study_guides": StudyGuideFile,
    "verse_commentary": VerseCommentaryBook,
    "topics": TopicsFile,
    "reading_plans": ReadingPlanFile,
    "featured_verses.json": FeaturedVerses,
    "red_letter_verses.json": RedLetterVerses,
    "resource_slugs.json": ResourceSlugs,
}


def load_json(file_path: Path) -> Tuple[Optional[dict], Optional[str]]:
    """Read and parse a UTF-8 JSON file without raising.

    Args:
        file_path: Location of the JSON file.

    Returns:
        A ``(data, error)`` pair. On success ``data`` is the parsed document
        and ``error`` is None. On failure ``data`` is None and ``error`` is a
        message prefixed with "JSON syntax error:" (malformed JSON) or
        "Error loading file:" (anything else, e.g. a missing file).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f), None
    except json.JSONDecodeError as e:
        return None, f"JSON syntax error: {e}"
    except Exception as e:
        # Deliberately broad: this is a reporting tool, so any I/O or decoding
        # problem is turned into a message instead of aborting the whole run.
        return None, f"Error loading file: {e}"


def validate_file(data_file: str, verbose: bool = False) -> bool:
    """Validate one entry of the data directory against its Pydantic model.

    Args:
        data_file: A file name inside DATA_DIR such as ``bible_metadata.json``,
            or one of the directory names ``verse_commentary``,
            ``study_guides``, ``topics`` or ``reading_plans``.
        verbose: Also print the model name, file size and error context.

    Returns:
        True if the data is valid, or if no model is registered for
        ``data_file`` (such names are skipped). False if the file is missing,
        cannot be parsed as JSON, or fails validation.
    """
    # Directory-backed data sets are validated file by file elsewhere.
    directory_validators = {
        "verse_commentary": validate_verse_commentary_directory,
        "study_guides": validate_study_guides_directory,
        "topics": validate_topics_directory,
        "reading_plans": validate_reading_plans_directory,
    }
    directory_validator = directory_validators.get(data_file)
    if directory_validator is not None:
        return directory_validator(verbose)

    model_class = MODEL_MAPPING.get(data_file)
    if model_class is None:
        if verbose:
            print(f"⚠️  {data_file}: No validation model defined (skipped)")
        return True

    data_path = DATA_DIR / data_file

    # Check if file exists
    if not data_path.exists():
        print(f"❌ {data_file}: File not found at {data_path}")
        return False

    # Load data file
    data, error = load_json(data_path)
    if error:
        print(f"❌ {data_file}: {error}")
        return False

    # Validate using Pydantic model
    try:
        # model_validate() accepts the parsed document for both RootModel and
        # regular BaseModel subclasses. Unlike ``model_class(**data)`` it also
        # reports a non-object document (list, null, ...) as a validation
        # failure instead of an opaque TypeError.
        model_class.model_validate(data)
    except ValidationError as e:
        print(f"❌ {data_file}: Validation failed")
        for error_detail in e.errors():
            # An empty location means the document itself is the problem.
            location = " -> ".join(str(loc) for loc in error_detail['loc']) or "(root)"
            print(f"   {location}: {error_detail['msg']}")
            if verbose and 'ctx' in error_detail:
                print(f"   Context: {error_detail['ctx']}")
        return False
    except Exception as e:
        print(f"❌ {data_file}: Unexpected error")
        print(f"   Error: {str(e)}")
        return False

    print(f"✅ {data_file}: Valid")
    if verbose:
        print(f"   Model: {model_class.__name__}")
        print(f"   Size: {data_path.stat().st_size:,} bytes")
    return True


def _validate_json_directory(name: str, validate, verbose: bool = False) -> bool:
    """Run ``validate`` on every ``*.json`` file in ``DATA_DIR / name``.

    Shared implementation behind the validate_*_directory() functions.

    Args:
        name: Sub-directory of DATA_DIR; also used as the label in the output.
        validate: Callable taking the parsed JSON document of one file. It
            signals a problem by raising (a pydantic ValidationError or any
            other exception); its return value is ignored.
        verbose: Also print a line for every file that passes.

    Returns:
        True if the directory exists and no file failed, otherwise False.
    """
    dir_path = DATA_DIR / name
    if not dir_path.exists():
        print(f"❌ {name}: Directory not found at {dir_path}")
        return False

    passed = 0
    failed = 0

    # Sorted so the report order is stable between runs.
    for file_path in sorted(dir_path.glob("*.json")):
        data, error = load_json(file_path)
        if error:
            print(f"❌ {file_path.name}: {error}")
            failed += 1
            continue

        try:
            validate(data)
        except ValidationError as e:
            print(f"❌ {file_path.name}: Validation failed")
            for error_detail in e.errors():
                # An empty location means the document itself is the problem.
                location = " -> ".join(str(loc) for loc in error_detail['loc']) or "(root)"
                print(f"   {location}: {error_detail['msg']}")
            failed += 1
        except Exception as e:
            print(f"❌ {file_path.name}: Unexpected error")
            print(f"   Error: {str(e)}")
            failed += 1
        else:
            if verbose:
                print(f"✅ {file_path.name}: Valid")
            passed += 1

    if failed == 0:
        print(f"✅ {name}: Valid ({passed} files)")
        return True

    print(f"❌ {name}: {failed} files failed validation")
    return False


def validate_verse_commentary_directory(verbose: bool = False) -> bool:
    """Validate all per-book verse commentary files.

    Returns True if the ``verse_commentary`` directory exists and every JSON
    file in it satisfies VerseCommentaryBook.
    """
    # model_validate() reports a non-object document as a validation failure,
    # where ``VerseCommentaryBook(**data)`` would raise an opaque TypeError.
    return _validate_json_directory(
        "verse_commentary", VerseCommentaryBook.model_validate, verbose
    )


def validate_study_guides_directory(verbose: bool = False) -> bool:
    """Validate per-guide study guide files.

    Returns True if the ``study_guides`` directory exists and every JSON file
    in it satisfies StudyGuideFile.
    """
    return _validate_json_directory(
        "study_guides", StudyGuideFile.model_validate, verbose
    )


def validate_topics_directory(verbose: bool = False) -> bool:
    """Validate per-topic files.

    Returns True if the ``topics`` directory exists and every JSON file in it
    satisfies TopicsFile.
    """
    def _check_topics(data):
        # The model expects the whole document under its ``root`` field.
        TopicsFile(root=data)

    return _validate_json_directory("topics", _check_topics, verbose)


def validate_reading_plans_directory(verbose: bool = False) -> bool:
    """Validate per-plan reading plan files.

    Returns True if the ``reading_plans`` directory exists and every JSON file
    in it holds exactly one top-level entry and satisfies ReadingPlanFile.
    """
    def _check_plan(data):
        # Each file has one key with the plan id mapping to the plan object
        if len(data) != 1:
            raise ValueError("Reading plan file must contain exactly one plan")
        ReadingPlanFile(plan=data)

    return _validate_json_directory("reading_plans", _check_plan, verbose)


def validate_all(verbose: bool = False) -> Tuple[int, int]:
    """Run validate_file() for every key of MODEL_MAPPING, in sorted order.

    Prints a banner first and, in verbose mode, a blank line after each entry.

    Returns:
        ``(passed, failed)`` counts of MODEL_MAPPING entries.
    """
    rule = "=" * 60
    print(rule)
    print("Validating JSON data files with Pydantic models")
    print(rule)
    print()

    # Tally results keyed by outcome: True -> passed, False -> failed.
    tally = {True: 0, False: 0}
    for entry_name in sorted(MODEL_MAPPING):
        outcome = validate_file(entry_name, verbose)
        tally[bool(outcome)] += 1
        if verbose:
            print()

    return tally[True], tally[False]


def validate_book_file(book_file: Path, verbose: bool = False) -> bool:
    """Validate a single book JSON file using BookIntroduction model.

    Args:
        book_file: Path of the book introduction file.
        verbose: Also print the file size when the file is valid.

    Returns:
        True if the file parses as JSON and satisfies BookIntroduction,
        otherwise False (the reason is printed).
    """
    # Load data file
    data, error = load_json(book_file)
    if error:
        print(f"❌ {book_file.name}: {error}")
        return False

    # Validate using Pydantic model
    try:
        # model_validate() reports a non-object document (list, null, ...) as a
        # validation failure, where ``BookIntroduction(**data)`` would raise an
        # opaque TypeError that surfaced as "Unexpected error".
        BookIntroduction.model_validate(data)
    except ValidationError as e:
        print(f"❌ {book_file.name}: Validation failed")
        for error_detail in e.errors():
            # An empty location means the document itself is the problem.
            location = " -> ".join(str(loc) for loc in error_detail['loc']) or "(root)"
            print(f"   {location}: {error_detail['msg']}")
        return False
    except Exception as e:
        print(f"❌ {book_file.name}: Unexpected error")
        print(f"   Error: {str(e)}")
        return False

    print(f"✅ {book_file.name}: Valid")
    if verbose:
        print(f"   Size: {book_file.stat().st_size:,} bytes")
    return True


def validate_all_books(verbose: bool = False) -> Tuple[int, int]:
    """Validate every ``*.json`` file in the ``books`` data directory.

    Args:
        verbose: Passed through to validate_book_file(); also prints a blank
            line after each file.

    Returns:
        ``(passed, failed)`` counts. A missing books directory is reported as
        ``(0, 1)`` so that callers which derive their exit status from the
        failure count do not treat it as success.
    """
    passed = 0
    failed = 0

    books_dir = DATA_DIR / "books"
    if not books_dir.exists():
        print(f"❌ Books directory not found: {books_dir}")
        # Previously (0, 0), which main() turned into exit status 0 even though
        # nothing had been validated.
        return 0, 1

    print("=" * 60)
    print("Validating 66 book introduction files")
    print("=" * 60)
    print()

    # NOTE(review): the banner says 66, but the number of files found is not
    # checked here — confirm whether a count check is wanted.
    for book_file in sorted(books_dir.glob("*.json")):
        if validate_book_file(book_file, verbose):
            passed += 1
        else:
            failed += 1
        if verbose:
            print()

    return passed, failed


def generate_json_schemas():
    """Write one JSON Schema file per model into SCHEMAS_DIR.

    Covers every entry of MODEL_MAPPING plus BookIntroduction. A failure for
    one schema is printed and does not stop the remaining ones.
    """
    rule = "=" * 60
    print(rule)
    print("Generating JSON Schema files from Pydantic models")
    print(rule)
    print()

    SCHEMAS_DIR.mkdir(exist_ok=True)

    def _export(schema_file, model_class, title):
        """Build, annotate and write a single schema, reporting the outcome."""
        try:
            document = model_class.model_json_schema()
            document['$id'] = f"https://kjvstudy.org/schemas/{schema_file}"
            document['title'] = title

            with open(SCHEMAS_DIR / schema_file, 'w', encoding='utf-8') as f:
                json.dump(document, f, indent=2, ensure_ascii=False)

            print(f"✅ Generated {schema_file}")
        except Exception as e:
            print(f"❌ Failed to generate {schema_file}: {e}")

    # Schemas for the main data files: "x.json" becomes "x.schema.json" and a
    # directory key "x" becomes "x.schema.json" as well.
    for data_file, model_class in MODEL_MAPPING.items():
        if data_file.endswith('.json'):
            schema_file = data_file.replace('.json', '.schema.json')
        else:
            schema_file = f"{data_file}.schema.json"
        _export(schema_file, model_class, model_class.__doc__ or model_class.__name__)

    # Schema for the book introduction files, which are not in MODEL_MAPPING.
    _export(
        "book_introduction.schema.json",
        BookIntroduction,
        "Schema for individual book introduction files",
    )

    print()
    print(f"Schemas written to: {SCHEMAS_DIR}")


def main():
    """Parse the command line, run the requested action and exit.

    Exactly one action runs, checked in this order: --generate-schemas,
    --books, --file, and otherwise validation of everything in MODEL_MAPPING.
    The process exit status is 0 on success and 1 if anything failed.
    """
    import argparse

    cli = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="Validate JSON data files with Pydantic models",
        epilog="""
Examples:
  python scripts/validate_data.py                    # Validate all files
  python scripts/validate_data.py -f bible_metadata.json  # Validate one file
  python scripts/validate_data.py --verbose          # Show details
  python scripts/validate_data.py --generate-schemas # Generate JSON schemas
        """
    )
    cli.add_argument('-f', '--file', metavar='FILE', help='Validate specific file only')
    cli.add_argument('-v', '--verbose', action='store_true', help='Show detailed output')
    cli.add_argument(
        '--generate-schemas',
        action='store_true',
        help='Generate JSON Schema files from Pydantic models',
    )
    cli.add_argument(
        '--books',
        action='store_true',
        help='Validate all 66 book introduction files',
    )

    options = cli.parse_args()

    def _summarize_and_exit(counts):
        """Print the pass/fail totals and exit non-zero if anything failed."""
        passed, failed = counts
        print()
        print("=" * 60)
        print(f"Results: {passed} passed, {failed} failed")
        print("=" * 60)
        sys.exit(0 if failed == 0 else 1)

    # Schema generation always exits with status 0.
    if options.generate_schemas:
        generate_json_schemas()
        sys.exit(0)

    # Book introduction files.
    if options.books:
        _summarize_and_exit(validate_all_books(options.verbose))

    # A single named file or directory.
    if options.file:
        succeeded = validate_file(options.file, options.verbose)
        sys.exit(0 if succeeded else 1)

    # Default: everything registered in MODEL_MAPPING.
    _summarize_and_exit(validate_all(options.verbose))


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()