Files
kjvstudy.org/scripts/analyze_data_coverage.py
T
2025-11-30 21:26:59 -05:00

192 lines
5.6 KiB
Python

#!/usr/bin/env python3
"""
Analyze data coverage to identify gaps and areas for improvement.
"""
import json
import sys
from pathlib import Path
from collections import defaultdict
# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from kjvstudy_org.kjv import bible
from kjvstudy_org.utils.commentary_loader import load_commentary_flat
def _book_from_reference(verse_ref):
    """Return the book-name portion of a commentary reference key.

    Accepts both "Book:chapter:verse" and "Book chapter:verse" spellings.
    Everything from the first ':' onward is discarded; if what remains ends
    in a space-separated number (the chapter of a "Book chapter:verse" key),
    that number is dropped as well. Leading ordinals such as the "1" in
    "1 John" are preserved because only a trailing number is removed.
    """
    head = verse_ref.split(':', 1)[0]
    book, _, tail = head.rpartition(' ')
    if book and tail.isdigit():
        return book.rstrip()
    return head


def analyze_commentary_coverage(commentary=None):
    """Print verse-commentary coverage statistics and a per-book breakdown.

    Args:
        commentary: Optional mapping keyed by verse reference. When omitted
            it is loaded with ``load_commentary_flat()``. Only the keys and
            the number of entries are used.

    Returns:
        The number of entries in the commentary mapping.
    """
    banner = "=" * 80
    print(banner)
    print("VERSE COMMENTARY COVERAGE ANALYSIS")
    print(banner)
    print()

    if commentary is None:
        commentary = load_commentary_flat()

    # Denominator used for the "whole Bible" coverage percentage.
    total_verses = 31102
    verses_with_commentary = len(commentary)
    coverage_pct = (verses_with_commentary / total_verses) * 100

    print(f"Total verses in Bible: {total_verses:,}")
    print(f"Verses with commentary: {verses_with_commentary:,}")
    print(f"Coverage: {coverage_pct:.1f}%")
    print()

    # Tally entries per book. The book name is parsed from the key so that
    # "Genesis 1:1" counts towards "Genesis" rather than "Genesis 1".
    by_book = defaultdict(int)
    for verse_ref in commentary:
        by_book[_book_from_reference(verse_ref)] += 1

    print(f"Books with commentary ({len(by_book)} books):")
    # Most-covered books first; ties keep first-seen order (stable sort).
    for book, count in sorted(by_book.items(), key=lambda x: x[1], reverse=True):
        print(f" {book}: {count} verses")
    print()

    return verses_with_commentary
def analyze_cross_reference_coverage(xref_file=None):
    """Print cross-reference coverage statistics.

    The JSON document is expected to map a verse key to a list of
    cross-reference entries (dicts). An entry counts as "described" when its
    ``'note'`` value is present and not blank.

    Args:
        xref_file: Optional path to the cross-reference JSON file. Defaults
            to ``kjvstudy_org/data/cross_references.json`` under the project
            root, located relative to this script rather than the current
            working directory.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    banner = "=" * 80
    print(banner)
    print("CROSS-REFERENCE COVERAGE ANALYSIS")
    print(banner)
    print()

    if xref_file is None:
        # Same root directory that is put on sys.path at import time, so the
        # report no longer depends on where the script is launched from.
        project_root = Path(__file__).resolve().parent.parent
        xref_file = project_root / "kjvstudy_org" / "data" / "cross_references.json"

    # Explicit UTF-8: the platform default encoding is locale-dependent.
    with open(xref_file, 'r', encoding='utf-8') as f:
        cross_refs = json.load(f)

    total_verses = 31102
    verses_with_xrefs = len(cross_refs)

    total_xrefs = 0
    xrefs_with_descriptions = 0
    for refs in cross_refs.values():
        total_xrefs += len(refs)
        # A missing, empty, or whitespace-only note does not count.
        xrefs_with_descriptions += sum(
            1 for ref in refs if (ref.get('note') or '').strip()
        )
    xrefs_without_descriptions = total_xrefs - xrefs_with_descriptions

    # Guard the ratios so an empty data file reports zeros instead of raising.
    avg_xrefs_per_verse = total_xrefs / verses_with_xrefs if verses_with_xrefs > 0 else 0
    coverage_pct = (verses_with_xrefs / total_verses) * 100
    description_pct = (xrefs_with_descriptions / total_xrefs) * 100 if total_xrefs > 0 else 0

    print(f"Total verses in Bible: {total_verses:,}")
    print(f"Verses with cross-references: {verses_with_xrefs:,}")
    print(f"Coverage: {coverage_pct:.1f}%")
    print()
    print(f"Total cross-reference entries: {total_xrefs:,}")
    print(f"Average per verse: {avg_xrefs_per_verse:.1f}")
    print()
    print(f"Cross-refs with descriptions: {xrefs_with_descriptions:,} ({description_pct:.1f}%)")
    print(f"Cross-refs without descriptions: {xrefs_without_descriptions:,}")
    print()
def analyze_word_studies(word_studies_file=None):
    """Print a summary of the word-study data set.

    The JSON document is expected to map each studied word to a dict. A
    study is classified by whether its ``'ot_term'`` and ``'nt_term'``
    values are truthy; studies with neither are counted in the total only.

    Args:
        word_studies_file: Optional path to the word-studies JSON file.
            Defaults to ``kjvstudy_org/data/word_studies.json`` under the
            project root, located relative to this script rather than the
            current working directory.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    banner = "=" * 80
    print(banner)
    print("WORD STUDIES ANALYSIS")
    print(banner)
    print()

    if word_studies_file is None:
        # Same root directory that is put on sys.path at import time, so the
        # report no longer depends on where the script is launched from.
        project_root = Path(__file__).resolve().parent.parent
        word_studies_file = project_root / "kjvstudy_org" / "data" / "word_studies.json"

    # Explicit UTF-8: the platform default encoding is locale-dependent.
    with open(word_studies_file, 'r', encoding='utf-8') as f:
        word_studies = json.load(f)

    total_studies = len(word_studies)

    # The three buckets are mutually exclusive.
    both_testaments = 0
    ot_only = 0
    nt_only = 0
    for data in word_studies.values():
        has_ot_term = bool(data.get('ot_term'))
        has_nt_term = bool(data.get('nt_term'))
        if has_ot_term and has_nt_term:
            both_testaments += 1
        elif has_ot_term:
            ot_only += 1
        elif has_nt_term:
            nt_only += 1

    print(f"Total word studies: {total_studies}")
    print(f" Both OT and NT: {both_testaments}")
    print(f" OT only: {ot_only}")
    print(f" NT only: {nt_only}")
    print()

    # First ten words in file order, as a quick sanity sample.
    print("Sample words studied:")
    for word in list(word_studies)[:10]:
        print(f" - {word}")
    print()
def analyze_books_without_data():
    """List the fifteen books with the lowest commentary coverage.

    For every book returned by ``bible.get_books()`` the verse total is
    summed across its chapters, and commentary keys that begin with the book
    name followed by ':' or ' ' are counted. Books are then printed in
    ascending order of commentary percentage.
    """
    rule = "=" * 80
    print(rule)
    print("BOOKS WITH MINIMAL STUDY MATERIAL")
    print(rule)
    print()

    all_books = bible.get_books()
    commentary_refs = load_commentary_flat()

    per_book = {}
    for name in all_books:
        # Total verses in the book, accumulated chapter by chapter.
        verse_total = 0
        for chapter in bible.get_chapters_for_book(name):
            verse_total += len(bible.get_verses_by_book_chapter(name, chapter))

        # The delimiter after the name keeps one book from matching another
        # whose name merely starts with the same text.
        prefixes = (name + ':', name + ' ')
        covered = len([key for key in commentary_refs.keys() if key.startswith(prefixes)])

        if verse_total > 0:
            pct = covered / verse_total * 100
        else:
            pct = 0

        per_book[name] = {
            'total_verses': verse_total,
            'commentary': covered,
            'commentary_pct': pct,
        }

    # Least-covered books first; the sort is stable, so ties keep book order.
    ranked = list(per_book.items())
    ranked.sort(key=lambda item: item[1]['commentary_pct'])

    print("Books with lowest commentary coverage:")
    print()
    for name, stats in ranked[:15]:
        print(f"{name:20} {stats['total_verses']:4} verses, {stats['commentary']:4} commentary ({stats['commentary_pct']:.1f}%)")
    print()
def main():
    """Run each coverage analysis in turn, then print the recommendations."""
    analyses = (
        analyze_commentary_coverage,
        analyze_cross_reference_coverage,
        analyze_word_studies,
        analyze_books_without_data,
    )
    for run_analysis in analyses:
        run_analysis()

    divider = "=" * 80
    print(divider)
    print("RECOMMENDATIONS FOR IMPROVEMENT")
    print(divider)
    print()

    # NOTE: the figures quoted in these lines are literal text; they are not
    # derived from the numbers the analyses above just printed.
    recommendations = (
        "1. Expand verse commentary to more books (currently 6-7% coverage)",
        "2. Add more word studies (currently 53 terms)",
        "3. Focus commentary on underserved books",
        "4. Consider adding thematic commentary (not just verse-by-verse)",
    )
    for line in recommendations:
        print(line)
    print()


# Produce the report only when executed as a script, not on import.
if __name__ == "__main__":
    main()