diff --git a/Dockerfile b/Dockerfile index fcb22ea..2f18af8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -49,5 +49,9 @@ COPY . . # Build search index at image build time for fast searches RUN python3 -c "from kjvstudy_org.utils.search_index import init_search_index; init_search_index()" -# Run uvicorn directly (no nginx sidecar) -CMD ["sh", "-c", "uv run uvicorn kjvstudy_org.server:app --host ${HOST:-0.0.0.0} --port ${PORT:-8000} --workers ${WORKERS:-1} --proxy-headers"] +# Run with gunicorn + uvicorn workers for production resilience: +# --max-requests: recycle workers after N requests (prevents memory leaks) +# --max-requests-jitter: stagger recycling so workers don't all restart at once +# --timeout: kill workers that hang for >60s +# --graceful-timeout: give workers 10s to finish after SIGTERM +CMD ["sh", "-c", "uv run gunicorn kjvstudy_org.server:app --worker-class uvicorn.workers.UvicornWorker --bind ${HOST:-0.0.0.0}:${PORT:-8000} --workers ${WORKERS:-2} --max-requests 2000 --max-requests-jitter 500 --timeout 60 --graceful-timeout 10 --proxy-protocol --forwarded-allow-ips='*' --access-logfile -"] diff --git a/Dockerfile.static b/Dockerfile.static new file mode 100644 index 0000000..b81e306 --- /dev/null +++ b/Dockerfile.static @@ -0,0 +1,89 @@ +# ============================================================================= +# Stage 1: Builder — install deps, generate static HTML pages +# ============================================================================= +FROM python:3.13 AS builder + +COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv + +ENV PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + UV_COMPILE_BYTECODE=1 \ + UV_LINK_MODE=copy + +WORKDIR /app + +COPY pyproject.toml uv.lock ./ +RUN uv sync --frozen --no-install-project --no-dev + +COPY . . 
+ +# Build search index (needed by app startup) +RUN uv run python3 -c "from kjvstudy_org.utils.search_index import init_search_index; init_search_index()" + +# Generate static HTML pages (~50K files, no PDFs or API JSON) +RUN uv run python scripts/generate_static_site.py --output /app/dist --workers 4 + +# ============================================================================= +# Stage 2: Runtime — nginx for static files + FastAPI sidecar for dynamic routes +# ============================================================================= +FROM python:3.13-slim + +# Install nginx + runtime deps for WeasyPrint (PDF generation in sidecar) +RUN apt-get update && apt-get install -y --no-install-recommends \ + nginx \ + curl \ + libpango-1.0-0 \ + libharfbuzz0b \ + libpangoft2-1.0-0 \ + libffi8 \ + libgdk-pixbuf-2.0-0 \ + shared-mime-info \ + fonts-dejavu-core \ + && rm -rf /var/lib/apt/lists/* + +COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv + +ENV PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + PYTHONPATH="/app" \ + PATH="/app/.venv/bin:$PATH" + +WORKDIR /app + +# Copy virtualenv from builder +COPY --from=builder /app/.venv /app/.venv + +# Copy application code (needed by the sidecar) +COPY --from=builder /app/kjvstudy_org /app/kjvstudy_org +COPY --from=builder /app/scripts/search_api.py /app/scripts/search_api.py + +# Copy pre-rendered static site +COPY --from=builder /app/dist /app/dist + +# Copy nginx config +COPY nginx.conf /etc/nginx/nginx.conf + +# Entrypoint: start FastAPI sidecar + nginx +COPY <<'ENTRY' /app/start.sh +#!/bin/sh +set -e + +# Start the FastAPI sidecar in the background +# It handles: search, API, PDFs, OG images, and any uncached pages +python3 /app/scripts/search_api.py & +SIDECAR_PID=$! 
# Rate limiting middleware — per-IP request throttle
class RateLimitMiddleware(BaseHTTPMiddleware):
    """In-memory per-IP rate limiter implemented as a token bucket.

    Every client IP owns a bucket that holds at most
    ``requests_per_second * 5`` tokens (the burst allowance) and refills
    continuously at ``requests_per_second``. Each request except ``/health``
    spends one token; a request that finds less than one token available is
    answered with HTTP 429 and is not forwarded to the application.

    The bucket table is a plain dict on the middleware instance, so the limit
    is enforced per process, not across workers.

    Args:
        app: The ASGI application being wrapped.
        requests_per_second: Sustained refill rate of each bucket.
    """

    # Number of tracked IPs above which an eviction sweep is considered.
    _CLEANUP_THRESHOLD = 5000
    # Minimum number of seconds between two eviction sweeps. Without this
    # throttle a flood from many distinct IPs would rebuild the whole table
    # on every single request.
    _CLEANUP_INTERVAL = 10.0

    def __init__(self, app, requests_per_second: float = 10.0):
        super().__init__(app)
        self.rate = requests_per_second
        # {ip: (token_count, last_refill_time)}
        self._buckets: dict[str, tuple[float, float]] = {}
        self._max_tokens = requests_per_second * 5  # burst allowance
        # -inf so the very first sweep is never blocked by the throttle.
        self._last_cleanup = float("-inf")

    async def dispatch(self, request: Request, call_next):
        """Spend one token for the caller's IP or reject the request with 429."""
        # Skip rate limiting for health checks
        if request.url.path == "/health":
            return await call_next(request)

        # NOTE(review): behind a reverse proxy this is only the real client
        # address if the server rewrites it from X-Forwarded-For — confirm.
        ip = request.client.host if request.client else "unknown"
        now = time.monotonic()

        # Unknown IPs start with a full bucket.
        tokens, last = self._buckets.get(ip, (self._max_tokens, now))
        tokens = min(self._max_tokens, tokens + (now - last) * self.rate)

        if tokens < 1.0:
            # The stored bucket is left untouched: the next request recomputes
            # the refill from the same timestamp, so no tokens are lost.
            return JSONResponse(
                {"detail": "Too many requests"},
                status_code=429,
                headers={"Retry-After": "1"},
            )

        self._buckets[ip] = (tokens - 1.0, now)
        self._evict_refilled_buckets(now)

        return await call_next(request)

    def _evict_refilled_buckets(self, now: float) -> None:
        """Drop buckets that have refilled completely.

        A bucket that is full again is indistinguishable from the default a
        new IP receives, so removing it never grants extra requests. The sweep
        runs only when the table is large and at most once per
        ``_CLEANUP_INTERVAL`` seconds.
        """
        if len(self._buckets) <= self._CLEANUP_THRESHOLD:
            return
        if now - self._last_cleanup < self._CLEANUP_INTERVAL:
            return

        self._last_cleanup = now
        rate = self.rate
        capacity = self._max_tokens
        self._buckets = {
            ip: (tokens, stamp)
            for ip, (tokens, stamp) in self._buckets.items()
            if tokens + (now - stamp) * rate < capacity
        }


# Request timeout middleware — give up on requests that take too long
class TimeoutMiddleware(BaseHTTPMiddleware):
    """Answer with HTTP 504 when the wrapped app exceeds a time limit.

    Args:
        app: The ASGI application being wrapped.
        timeout_seconds: Maximum time to wait for ``call_next`` to return.
    """

    def __init__(self, app, timeout_seconds: float = 30.0):
        super().__init__(app)
        self.timeout = timeout_seconds

    async def dispatch(self, request: Request, call_next):
        """Await the downstream response, cancelling it after ``self.timeout``."""
        import asyncio

        # NOTE(review): the limit covers only the awaiting of call_next; if
        # that returns before a streamed body is fully sent, the streaming
        # phase is presumably not bounded — confirm against Starlette.
        try:
            return await asyncio.wait_for(
                call_next(request),
                timeout=self.timeout,
            )
        except asyncio.TimeoutError:
            return JSONResponse(
                {"detail": "Request timeout"},
                status_code=504,
            )
server {
    listen 8000;
    server_name _;
    root /app/dist;

    # Security headers.
    # nginx inherits add_header from this level only into locations that
    # declare no add_header of their own, so the two locations below that set
    # Cache-Control (/static/ and /) repeat these two lines.
    add_header X-Content-Type-Options nosniff always;
    add_header X-Frame-Options SAMEORIGIN always;

    # -----------------------------------------------------------
    # Dynamic routes — proxy to FastAPI sidecar
    # -----------------------------------------------------------

    # Search (dynamic query results)
    location = /search {
        proxy_pass http://sidecar;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
    }

    # All API endpoints
    location /api/ {
        proxy_pass http://sidecar;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
    }

    # PDF generation (on-demand)
    location ~ /pdf$ {
        proxy_pass http://sidecar;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
        proxy_read_timeout 30s;
    }

    # Verse of the day redirect (needs server-side date logic)
    location = /verse-of-the-day {
        proxy_pass http://sidecar;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
    }

    # OG images (dynamically generated)
    location /og/ {
        proxy_pass http://sidecar;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
    }

    # Family tree search (dynamic query)
    location = /family-tree/search {
        proxy_pass http://sidecar;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
    }

    # Family tree SVG (dynamically rendered)
    location = /family-tree/lineage.svg {
        proxy_pass http://sidecar;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
    }

    # OpenAPI docs.
    # These prefixes are longer than /api/ and therefore win over it, so they
    # must carry the same forwarded-client headers as every other proxied
    # route; otherwise the sidecar sees these requests without client info.
    location /api/docs {
        proxy_pass http://sidecar;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
    }
    location /api/redoc {
        proxy_pass http://sidecar;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
    }
    location /api/openapi.json {
        proxy_pass http://sidecar;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
    }

    # -----------------------------------------------------------
    # Health check — static (no sidecar dependency)
    # NOTE: this reports nginx as healthy even when the sidecar is down.
    # -----------------------------------------------------------
    location = /health {
        default_type application/json;
        return 200 '{"status":"healthy","service":"kjv-study"}';
    }

    # -----------------------------------------------------------
    # Static assets — aggressive caching
    # -----------------------------------------------------------
    location /static/ {
        expires 1y;
        add_header Cache-Control "public, immutable";
        # Repeated because add_header above disables inheritance here.
        add_header X-Content-Type-Options nosniff always;
        add_header X-Frame-Options SAMEORIGIN always;
        try_files $uri =404;
    }

    # -----------------------------------------------------------
    # Robots / sitemaps
    # -----------------------------------------------------------
    location = /robots.txt {
        default_type text/plain;
        expires 1d;
    }
    location ~ ^/sitemap.*\.xml$ {
        default_type application/xml;
        expires 1d;
    }

    # Random verse list JSON
    location = /random-verse-list.json {
        default_type application/json;
        expires 7d;
    }

    # -----------------------------------------------------------
    # Default — serve pre-rendered HTML with clean URLs
    # -----------------------------------------------------------
    location / {
        # No bare "$uri/" entry: it matches every existing directory, including
        # ones that exist only as parents of rendered pages and have no
        # index.html, and nginx would then answer 403 instead of falling
        # through to the sidecar.
        try_files $uri $uri/index.html @sidecar;
        expires 7d;
        add_header Cache-Control "public";
        # Repeated because add_header above disables inheritance here.
        add_header X-Content-Type-Options nosniff always;
        add_header X-Frame-Options SAMEORIGIN always;
    }

    # Fallback: if no static file exists, proxy to sidecar
    location @sidecar {
        proxy_pass http://sidecar;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
    }

    # Custom 404
    error_page 404 /404.html;
    location = /404.html {
        internal;
    }
}
def create_app():
    """Import and return the FastAPI application.

    The import is deferred to call time so that importing this module does
    not load the application.
    """
    from kjvstudy_org.server import app
    return app


def get_test_client(app):
    """Return a ``TestClient`` for *app* that reports server errors as responses.

    ``raise_server_exceptions=False`` is passed so that a failing page does
    not raise out of ``client.get`` and abort the whole run.
    """
    from fastapi.testclient import TestClient
    return TestClient(app, raise_server_exceptions=False)


def enumerate_urls():
    """Enumerate high-traffic HTML pages only (~1,300 URLs).

    Covers: homepage, book listing, 66 book pages, ~1,189 chapter pages,
    and a handful of top-level resource/about pages.

    Returns:
        A list of URL paths, fixed pages first, then one entry per book
        followed by that book's chapters.
    """
    from kjvstudy_org.kjv import bible

    urls = [
        "/",
        "/books",
        "/resources",
        "/about",
        "/about/stats",
        "/about/cross-references",
        "/about/accessibility",
        "/about/commentary",
        "/topics",
        "/reading-plans",
        "/study-guides",
        "/stories",
        "/stories/kids",
        "/strongs",
        "/strongs/hebrew",
        "/strongs/greek",
        "/interlinear",
        "/family-tree",
        "/biblical-timeline",
        "/biblical-maps",
        "/red-letter",
        "/stars",
    ]

    # 66 book pages + ~1,189 chapter pages
    for book in bible.get_books():
        urls.append(f"/book/{book}")
        urls.extend(
            f"/book/{book}/chapter/{chapter}"
            for chapter in bible.get_chapters_for_book(book)
        )

    return urls


def url_to_filepath(output_dir: Path, url: str) -> Path:
    """Map a URL path to the ``index.html`` file that serves it as a clean URL.

    ``/`` maps to ``<output_dir>/index.html``; ``/a/b`` maps to
    ``<output_dir>/a/b/index.html``.
    """
    path = url.strip("/")
    if not path:
        return output_dir / "index.html"
    return output_dir / path / "index.html"


def render_url(client, output_dir: Path, url: str) -> tuple[str, bool, str]:
    """Fetch *url* through *client* and write the result below *output_dir*.

    Args:
        client: Object with a ``get(url)`` method returning a response that
            exposes ``status_code``, ``headers`` and ``content``.
        output_dir: Root directory of the generated site.
        url: URL path to render.

    Returns:
        ``(url, success, message)``. ``message`` is ``"ok"`` for a rendered
        page, ``"redirect"`` when a redirect stub was written, ``"HTTP <code>"``
        for a 4xx/5xx response, or the truncated exception text.
    """
    # Function-scope import: only the redirect stub needs it.
    from html import escape

    try:
        filepath = url_to_filepath(output_dir, url)
        response = client.get(url)

        if response.status_code >= 400:
            return (url, False, f"HTTP {response.status_code}")

        # NOTE(review): this branch is only reachable if the client does not
        # follow redirects itself — confirm the TestClient configuration.
        if response.status_code in (301, 302, 307, 308):
            # Escaped because the target ends up inside HTML attributes.
            location = escape(response.headers.get("location", "/"), quote=True)
            redirect_html = (
                f'<!DOCTYPE html><html><head><meta charset="utf-8">'
                f'<meta http-equiv="refresh" content="0; url={location}">'
                f'<link rel="canonical" href="{location}"></head>'
                f'<body><a href="{location}">Redirecting...</a></body></html>'
            )
            filepath.parent.mkdir(parents=True, exist_ok=True)
            filepath.write_text(redirect_html, encoding="utf-8")
            return (url, True, "redirect")

        filepath.parent.mkdir(parents=True, exist_ok=True)
        filepath.write_bytes(response.content)
        return (url, True, "ok")

    except Exception as e:
        # Boundary of a worker task: report the failure instead of letting one
        # bad page abort the whole run.
        return (url, False, str(e)[:200])
+

Random Verse

+

Picking a random verse…

+
def generate_utility_files(client, output_dir: Path):
    """Fetch robots.txt and the sitemap files and write them into *output_dir*.

    Args:
        client: Object with a ``get(url)`` method returning a response that
            exposes ``status_code`` and ``content``.
        output_dir: Existing directory that receives the files.

    A file whose response is not HTTP 200 is not written; the skip is
    reported on stdout so a missing sitemap does not go unnoticed.
    """
    for url, filename in [
        ("/robots.txt", "robots.txt"),
        ("/sitemap.xml", "sitemap.xml"),
        ("/sitemap-main.xml", "sitemap-main.xml"),
        ("/sitemap-verses.xml", "sitemap-verses.xml"),
    ]:
        resp = client.get(url)
        if resp.status_code == 200:
            (output_dir / filename).write_bytes(resp.content)
            print(f"  {filename}")
        else:
            print(f"  {filename} skipped (HTTP {resp.status_code})")


def copy_static_assets(output_dir: Path):
    """Copy the package's ``static`` directory to ``<output_dir>/static``.

    An existing destination is removed first. ``search_index.db`` and
    ``scofield_commentary.json`` are left out of the copy.
    """
    src = PROJECT_ROOT / "kjvstudy_org" / "static"
    dst = output_dir / "static"
    if dst.exists():
        shutil.rmtree(dst)
    shutil.copytree(
        src,
        dst,
        ignore=shutil.ignore_patterns("search_index.db", "scofield_commentary.json"),
    )
    print(f"  Static assets copied to {dst}")


def main(argv=None):
    """Command-line entry point: render the static site.

    Args:
        argv: Argument list to parse; ``None`` (the default) uses
            ``sys.argv[1:]``.

    Rendering is best-effort by default: failed pages are listed and the
    process still exits with status 0. With ``--strict`` the process exits
    with status 1 when at least one page failed.
    """
    parser = argparse.ArgumentParser(description="Generate static HTML site for kjvstudy.org")
    parser.add_argument("--output", "-o", default="dist", help="Output directory")
    parser.add_argument("--workers", "-w", type=int, default=4, help="Parallel workers")
    parser.add_argument("--dry-run", action="store_true", help="Enumerate URLs without rendering")
    parser.add_argument("--strict", action="store_true", help="Exit with status 1 if any page fails to render")
    args = parser.parse_args(argv)

    output_dir = Path(args.output).resolve()

    print("Static site generator for kjvstudy.org")
    print(f"Output: {output_dir}")
    print(f"Workers: {args.workers}")
    print()

    print("Initializing FastAPI app...")
    app = create_app()

    # The context manager guarantees the client is closed on every exit path,
    # including the dry-run return and exceptions.
    with get_test_client(app) as client:
        print("Enumerating URLs...")
        all_urls = enumerate_urls()
        total = len(all_urls)
        print(f"  Total: {total} HTML pages")

        if args.dry_run:
            print("\nDry run — not rendering.")
            return

        # Start from an empty output directory so stale pages never survive.
        if output_dir.exists():
            shutil.rmtree(output_dir)
        output_dir.mkdir(parents=True)

        print("\nCopying static assets...")
        copy_static_assets(output_dir)

        print("\nGenerating utility files...")
        generate_utility_files(client, output_dir)

        print("\nGenerating random verse page...")
        generate_random_verse_page(output_dir)

        print(f"\nRendering {total} HTML pages...")
        start = time.time()
        ok = 0
        errors = []

        # NOTE(review): one client is shared by all worker threads; this
        # assumes concurrent ``client.get`` calls are safe — confirm.
        with ThreadPoolExecutor(max_workers=args.workers) as pool:
            futures = [pool.submit(render_url, client, output_dir, u) for u in all_urls]
            for done, future in enumerate(as_completed(futures), start=1):
                url, success, msg = future.result()
                if success:
                    ok += 1
                else:
                    errors.append((url, msg))

                if done % 200 == 0 or done == total:
                    elapsed = time.time() - start
                    rate = done / elapsed if elapsed > 0 else 0
                    print(f"  [{done}/{total}] {rate:.0f}/sec errors={len(errors)}")

        elapsed = time.time() - start
        print(f"\nDone in {elapsed:.1f}s")
        print(f"  Success: {ok}")
        print(f"  Errors: {len(errors)}")

        if errors:
            shown = errors[:20]
            print("\nErrors:")
            for url, msg in shown:
                print(f"  {url}: {msg}")
            if len(errors) > len(shown):
                print(f"  ... and {len(errors) - len(shown)} more")

        if args.strict and errors:
            sys.exit(1)


if __name__ == "__main__":
    main()
def build_gunicorn_command(port: int, workers: int, host: str = "0.0.0.0") -> list[str]:
    """Build the argv that starts the sidecar under gunicorn.

    Args:
        port: TCP port gunicorn binds to.
        workers: Number of gunicorn worker processes.
        host: Interface gunicorn binds to.

    Returns:
        The full command line, starting with the current Python interpreter
        so gunicorn runs from the same environment as this script.
    """
    return [
        sys.executable, "-m", "gunicorn",
        "kjvstudy_org.server:app",
        "--worker-class", "uvicorn.workers.UvicornWorker",
        "--bind", f"{host}:{port}",
        "--workers", str(workers),
        # Recycle workers after a jittered number of requests.
        "--max-requests", "1000",
        "--max-requests-jitter", "200",
        "--timeout", "60",
        "--graceful-timeout", "10",
        "--forwarded-allow-ips", "*",
        "--log-level", "warning",
        "--access-logfile", "-",
    ]


if __name__ == "__main__":
    port = int(os.getenv("SIDECAR_PORT", "8001"))
    workers = int(os.getenv("SIDECAR_WORKERS", "1"))
    # NOTE(review): nginx reaches the sidecar on 127.0.0.1, so a loopback bind
    # would presumably suffice; the default stays 0.0.0.0 to keep existing
    # behaviour. Set SIDECAR_HOST to narrow it.
    host = os.getenv("SIDECAR_HOST", "0.0.0.0")

    completed = subprocess.run(build_gunicorn_command(port, workers, host), check=False)

    # Propagate gunicorn's exit status instead of always exiting 0. A negative
    # return code means "killed by signal N"; map it to the shell's 128 + N.
    code = completed.returncode
    sys.exit(code if code >= 0 else 128 - code)
editable = "." } dependencies = [ { name = "fastapi", extra = ["standard"] }, { name = "ged4py" }, + { name = "gunicorn" }, { name = "mistune" }, { name = "parse" }, { name = "python-gedcom" }, @@ -501,6 +514,7 @@ dev = [ requires-dist = [ { name = "fastapi", extras = ["standard"], specifier = ">=0.115.12" }, { name = "ged4py", specifier = ">=0.5.2" }, + { name = "gunicorn", specifier = ">=24.1.1" }, { name = "mistune", specifier = ">=3.0.2" }, { name = "parse", specifier = ">=1.20.2" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.3.5" },