Files
kjvstudy.org/kjvstudy_org/utils/search_index.py
T
kennethreitz d4c364eb05 Optimize backend performance with caching and thread safety
Pre-process verse text cleaning once at initialization (5-10x speedup for iteration), fix SQLite connection thread safety for concurrent requests, and add LRU caching to frequently-called functions.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-27 17:33:53 -05:00

254 lines
8.1 KiB
Python

"""
SQLite FTS5 search index for fast Bible verse search.
This module provides a full-text search index using SQLite's FTS5 extension,
enabling efficient searches across all 31,102 Bible verses.
"""
import sqlite3
from pathlib import Path
from typing import List, Dict, Optional
from contextlib import contextmanager
from ..kjv import bible
# Where the SQLite index file lives: the "static" directory one level above
# this module's own directory, alongside the other data files.
DB_PATH = Path(__file__).parent.parent.joinpath("static", "search_index.db")
@contextmanager
def get_connection():
    """
    Yield a fresh SQLite connection to the search index and close it on exit.

    Every ``with get_connection() as conn:`` block opens its own connection to
    ``DB_PATH`` rather than sharing a module-level one, so concurrent callers
    never touch each other's connection object. Rows come back as
    ``sqlite3.Row`` and can therefore be read by column name.

    The connection is closed when the block exits, whether normally or via an
    exception; nothing is committed automatically.
    """
    connection = sqlite3.connect(str(DB_PATH), check_same_thread=True)
    connection.row_factory = sqlite3.Row
    try:
        yield connection
    finally:
        # Runs on both normal exit and exceptions raised inside the with-body.
        connection.close()
def init_search_index(force_rebuild: bool = False) -> bool:
    """
    Initialize the FTS5 search index.

    Creates the database at ``DB_PATH`` and populates it with every verse
    yielded by ``bible.iter_verses()``. Two tables are written: the FTS5
    virtual table ``verses_fts`` (used for full-text search) and the regular
    table ``verses`` (metadata and indexed lookups).

    If the build fails part-way, the partially written database file is
    removed before the exception propagates. Otherwise the leftover file would
    make every later call see an "existing" index and skip the rebuild.

    Args:
        force_rebuild: When True, delete any existing database and rebuild it.

    Returns:
        True if the index was created/rebuilt, False if it already existed.
    """
    if DB_PATH.exists() and not force_rebuild:
        return False

    # Ensure directory exists
    DB_PATH.parent.mkdir(parents=True, exist_ok=True)

    # Remove old database if rebuilding
    if DB_PATH.exists():
        DB_PATH.unlink()

    insert_verses_sql = (
        "INSERT INTO verses (book, chapter, verse, text, reference) VALUES (?, ?, ?, ?, ?)"
    )
    insert_fts_sql = (
        "INSERT INTO verses_fts (book, chapter, verse, text, reference) VALUES (?, ?, ?, ?, ?)"
    )
    batch_size = 1000

    def _insert_batch(cursor, rows):
        # Write the same rows to both the lookup table and the FTS table.
        cursor.executemany(insert_verses_sql, rows)
        cursor.executemany(insert_fts_sql, rows)

    build_succeeded = False
    try:
        with get_connection() as conn:
            cursor = conn.cursor()

            # Create FTS5 virtual table for full-text search
            cursor.execute("""
                CREATE VIRTUAL TABLE IF NOT EXISTS verses_fts USING fts5(
                    book,
                    chapter,
                    verse,
                    text,
                    reference,
                    tokenize='porter unicode61'
                )
            """)

            # Create regular table for metadata and fast lookups
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS verses (
                    id INTEGER PRIMARY KEY,
                    book TEXT NOT NULL,
                    chapter INTEGER NOT NULL,
                    verse INTEGER NOT NULL,
                    text TEXT NOT NULL,
                    reference TEXT NOT NULL,
                    UNIQUE(book, chapter, verse)
                )
            """)

            # Create indexes for common queries
            cursor.execute("CREATE INDEX IF NOT EXISTS idx_book ON verses(book)")
            cursor.execute("CREATE INDEX IF NOT EXISTS idx_book_chapter ON verses(book, chapter)")

            # Populate with all verses, inserting in batches of batch_size
            print("Building search index...")
            batch = []
            total = 0
            for verse in bible.iter_verses():
                reference = f"{verse.book} {verse.chapter}:{verse.verse}"
                batch.append((verse.book, verse.chapter, verse.verse, verse.text, reference))
                total += 1
                if len(batch) >= batch_size:
                    _insert_batch(cursor, batch)
                    batch = []

            # Insert remaining
            if batch:
                _insert_batch(cursor, batch)

            conn.commit()
        build_succeeded = True
    finally:
        if not build_succeeded:
            # The connection is closed by now; drop the half-built file so the
            # next call rebuilds instead of treating it as a valid index.
            # Best-effort: a cleanup failure must not mask the original error.
            try:
                DB_PATH.unlink()
            except OSError:
                pass

    print(f"Search index built with {total} verses")
    return True
def search_verses(
    query: str,
    limit: Optional[int] = None,
    book_filter: Optional[str] = None,
    testament_filter: Optional[str] = None
) -> List[Dict]:
    """
    Search for verses matching the query using FTS5.

    The query is split on whitespace and every term is passed to FTS5 as a
    quoted string, so FTS5 operators typed by the user (AND, OR, NEAR, ``*``,
    column filters, ...) are treated as plain text rather than interpreted.
    Double quotes inside a term are escaped by doubling them, so arbitrary
    user input cannot produce a malformed MATCH expression.

    Args:
        query: Search terms, separated by whitespace.
        limit: Maximum number of results. ``None`` (or 0) means no limit.
        book_filter: Filter to specific book (exact match on the book name).
        testament_filter: Filter to "old" or "new" testament
            (case-insensitive). Any other value applies no filter.

    Returns:
        List of matching verses ordered by relevance, each a dict with the
        keys ``book``, ``chapter``, ``verse``, ``text``, ``reference``,
        ``url``, ``score`` and ``highlighted_text``. A blank query returns
        an empty list.
    """
    search_terms = query.strip()
    if not search_terms:
        # Nothing to search for: skip building the index / opening the DB.
        return []

    # Ensure index exists
    if not DB_PATH.exists():
        init_search_index()

    # Quote every term as an FTS5 string. Inside an FTS5 string a literal
    # double quote is written as two double quotes.
    fts_query = ' '.join(
        '"{}"'.format(term.replace('"', '""')) for term in search_terms.split()
    )

    # Build SQL with optional filters
    sql = """
        SELECT
            book,
            chapter,
            verse,
            text,
            reference,
            bm25(verses_fts) as score
        FROM verses_fts
        WHERE verses_fts MATCH ?
    """
    params = [fts_query]

    if book_filter:
        sql += " AND book = ?"
        params.append(book_filter)

    if testament_filter:
        ot_books = (
            'Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy',
            'Joshua', 'Judges', 'Ruth', '1 Samuel', '2 Samuel',
            '1 Kings', '2 Kings', '1 Chronicles', '2 Chronicles',
            'Ezra', 'Nehemiah', 'Esther', 'Job', 'Psalms', 'Proverbs',
            'Ecclesiastes', 'Song of Solomon', 'Isaiah', 'Jeremiah',
            'Lamentations', 'Ezekiel', 'Daniel', 'Hosea', 'Joel', 'Amos',
            'Obadiah', 'Jonah', 'Micah', 'Nahum', 'Habakkuk', 'Zephaniah',
            'Haggai', 'Zechariah', 'Malachi',
        )
        placeholders = ','.join('?' * len(ot_books))
        testament = testament_filter.lower()
        if testament == 'old':
            sql += f" AND book IN ({placeholders})"
            params.extend(ot_books)
        elif testament == 'new':
            # New Testament is expressed as "everything not in the OT list".
            sql += f" AND book NOT IN ({placeholders})"
            params.extend(ot_books)

    # Order by relevance (bm25 score - lower is better in SQLite)
    sql += " ORDER BY score"

    if limit:
        sql += " LIMIT ?"
        params.append(limit)

    with get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute(sql, params)
        return [
            {
                "book": row["book"],
                "chapter": row["chapter"],
                "verse": row["verse"],
                "text": row["text"],
                "reference": row["reference"],
                "url": f"/book/{row['book']}/chapter/{row['chapter']}#verse-{row['verse']}",
                "score": abs(row["score"]),  # BM25 returns negative, we want positive
                "highlighted_text": highlight_matches(row["text"], query),
            }
            for row in cursor.fetchall()
        ]
def highlight_matches(text: str, query: str) -> str:
    """
    Highlight matching terms in text.

    Every case-insensitive occurrence of a whitespace-separated term from
    ``query`` is wrapped in ``<mark>...</mark>``. The matched text keeps the
    casing it has in ``text``.

    All terms are matched in a single pass, so a later term can never match
    inside a ``<mark>`` tag inserted for an earlier one, and overlapping terms
    do not produce nested tags.

    Args:
        text: The text to annotate.
        query: The raw search query; split on whitespace into terms.

    Returns:
        ``text`` with matches wrapped in ``<mark>`` tags, or ``text``
        unchanged when the query contains no terms.

    NOTE: ``text`` itself is not HTML-escaped here.
    """
    import re

    # De-duplicate (order-preserving), then put longer terms first so the
    # alternation prefers the longest term when several start at one position.
    terms = sorted(dict.fromkeys(query.lower().split()), key=len, reverse=True)
    if not terms:
        return text

    pattern = re.compile('|'.join(re.escape(term) for term in terms), re.IGNORECASE)
    # A function replacement avoids backslash/group processing of the term and
    # re-emits exactly what was matched, preserving the original casing.
    return pattern.sub(lambda match: f'<mark>{match.group(0)}</mark>', text)
def get_search_stats() -> Dict:
    """
    Report whether the search index exists and how large it is.

    Returns:
        ``{"indexed": False, "verses": 0}`` when the database file is absent.
        Otherwise a dict with ``indexed`` set to True, ``verses`` set to the
        row count of the ``verses`` table, and ``db_size_mb`` set to the size
        of the database file in MiB rounded to two decimals.
    """
    if DB_PATH.exists():
        with get_connection() as conn:
            counter = conn.cursor()
            counter.execute("SELECT COUNT(*) FROM verses")
            verse_count = counter.fetchone()[0]
            size_in_mb = DB_PATH.stat().st_size / (1024 * 1024)
            return {
                "indexed": True,
                "verses": verse_count,
                "db_size_mb": round(size_in_mb, 2)
            }
    return {"indexed": False, "verses": 0}
# The index is deliberately NOT built at import time. This check has no
# effect and only marks the spot: call init_search_index() explicitly
# (search_verses() also builds the index on first use if the file is missing).
if not DB_PATH.exists():
    # Don't auto-init during import - call init_search_index() explicitly
    pass