Files
kennethreitz.org/tuftecms/core/markdown.py
T
kennethreitz 0352d10279 Move oEmbed fetching to client-side for faster, more reliable page loads
Server no longer makes HTTP requests during markdown rendering. Bare URLs
render as placeholders with fallback links, and JS fetches oEmbed data
asynchronously via a new /api/oembed proxy endpoint.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-10 09:43:28 -04:00

222 lines
8.3 KiB
Python

"""Markdown processing core module."""
import re
from html import escape as html_escape
from urllib.parse import quote as url_quote
import mistune
import yaml
from ..utils.svg_icons import generate_unique_svg_icon
from ..utils.content import (
calculate_reading_time,
extract_tags_from_content,
find_series_posts,
)
def _process_oembed(html):
"""Replace bare URLs in paragraphs with client-side oEmbed placeholders."""
def _replace_link(match):
url = match.group(1)
# SoundCloud: render inline, no discovery needed.
if re.match(r"https?://(?:www\.)?soundcloud\.com/.+", url):
encoded = url_quote(url, safe="")
return (
f'<iframe width="100%" height="20" scrolling="no" frameborder="no"'
f' src="https://w.soundcloud.com/player/?url={encoded}'
f'&color=%23333333&auto_play=false&hide_related=true'
f'&show_comments=false&show_user=false&show_reposts=false'
f'&show_teaser=false&show_artwork=false&visual=false"></iframe>'
)
# Everything else: render a placeholder for client-side oEmbed.
safe_url = html_escape(url, quote=True)
return (
f'<div class="oembed-placeholder" data-oembed-url="{safe_url}">'
f'<a href="{safe_url}" target="_blank" rel="noopener">{safe_url}</a>'
f'</div>'
)
# Match any <p> containing only a single bare <a> link (any https URL).
return re.sub(
r'<p><a href="(https?://[^"]+)">[^<]+</a></p>',
_replace_link,
html,
)
class TufteMarkdownRenderer:
"""Custom Markdown renderer for Tufte-style output."""
def __init__(self):
"""Initialize the renderer with Mistune."""
self.markdown = mistune.create_markdown(
escape=False,
plugins=[
"strikethrough",
"footnotes",
"table",
"task_lists",
"def_list",
"url",
],
)
def render(self, content):
"""Render markdown content to HTML."""
html = self.markdown(content.strip())
return _process_oembed(html)
def render_markdown_file(file_path):
"""Render a markdown file to HTML with metadata extraction."""
try:
with open(file_path, "r", encoding="utf-8") as f:
content = f.read()
# Extract YAML front matter if it exists
metadata = {}
yaml_pattern = r"^---\s*\n(.*?)\n---\s*\n"
yaml_match = re.match(yaml_pattern, content, re.DOTALL)
if yaml_match:
try:
metadata = yaml.safe_load(yaml_match.group(1)) or {}
content = content[yaml_match.end() :]
except:
pass
# Extract first h1 header if it exists
first_h1 = None
# Look for the first H1 at the start of the file (must be on first line or after blank line)
h1_match = re.search(r"^# (.+?)$", content, re.MULTILINE)
if h1_match:
first_h1 = h1_match.group(1).strip()
# Remove only the first h1 line from content to avoid duplication
content = re.sub(r"^# .+?$", "", content, count=1, flags=re.MULTILINE)
# Extract date from italic date pattern (e.g., "*August 2025*")
# Only match dates that look like month/year patterns, not quotes or long text
date_match = re.search(
r"^\*([A-Za-z]+ \d{4}|\d{4})\*\s*$", content, re.MULTILINE
)
if date_match and not metadata.get("date"):
date_text = date_match.group(1).strip()
# Skip only if it's "January 2025" (current year placeholder)
if not (date_text.lower().startswith("january") and "2025" in date_text):
# Format "January YYYY" (not 2025) as just "YYYY" for cleaner display
if (
re.match(r"^january\s+(\d{4})$", date_text.lower())
and "2025" not in date_text
):
year_match = re.search(r"(\d{4})", date_text)
if year_match:
date_text = year_match.group(1)
# Keep other months like "August 2025" as full format
metadata["date"] = date_text
# Remove the date line from content
content = re.sub(
r"^\*([A-Za-z]+ \d{4}|\d{4})\*\s*$",
"",
content,
count=1,
flags=re.MULTILINE,
)
# Create renderer and process content
renderer = TufteMarkdownRenderer()
html_content = renderer.render(content)
# Add anchor IDs to headings using post-processing on HTML
html_content = add_heading_anchor_ids(html_content)
# Post-processing for poetry line breaks
# Check if this is likely a poetry file based on file path
if file_path and "poetry" in str(file_path):
# For poetry, convert single line breaks within paragraphs to <br> tags
html_content = re.sub(
r"<p>(.*?)</p>",
lambda m: "<p>" + m.group(1).replace("\n", "<br>\n") + "</p>",
html_content,
flags=re.DOTALL,
)
# Add classes to headers to prevent conflicts with page headers
html_content = html_content.replace("<h1>", '<h1 class="content-header">')
html_content = html_content.replace("<h2>", '<h2 class="content-header">')
html_content = html_content.replace("<h3>", '<h3 class="content-header">')
html_content = html_content.replace("<h4>", '<h4 class="content-header">')
html_content = html_content.replace("<h5>", '<h5 class="content-header">')
html_content = html_content.replace("<h6>", '<h6 class="content-header">')
# Use the first h1 as title if available, otherwise fallback to metadata or filename
if first_h1:
title = first_h1
elif "title" in metadata:
title = metadata["title"]
else:
title = file_path.stem.replace("-", " ").replace("_", " ").title()
# Calculate reading time
reading_time, word_count = calculate_reading_time(html_content)
# Extract tags
tags = extract_tags_from_content(html_content, metadata, file_path)
# Find series posts if this post is part of a series
series_posts = find_series_posts(metadata, file_path)
# Generate unique icon for this content
# Use folder icon for index.md files (directory pages)
if file_path.name == "index.md":
from ..utils.content import generate_folder_icon
unique_icon = generate_folder_icon(title, size=32)
else:
unique_icon = generate_unique_svg_icon(title, size=32)
return {
"content": html_content,
"title": title,
"metadata": metadata,
"reading_time": reading_time,
"word_count": word_count,
"tags": tags,
"series_posts": series_posts,
"series_name": metadata.get("series"),
"unique_icon": unique_icon,
}
except Exception as e:
return {
"content": f"<p>Error reading file: {str(e)}</p>",
"title": "Error",
"metadata": {},
}
def add_heading_anchor_ids(html_content):
"""Add anchor IDs to headings for navigation."""
def replace_heading(match):
tag = match.group(1) # h1, h2, etc.
level = int(tag[1]) # 1, 2, etc.
classes = match.group(2) or "" # existing classes if any
text = match.group(3)
# Generate anchor ID from heading text (remove HTML tags first)
clean_text = re.sub(r"<[^>]+>", "", text)
anchor_id = re.sub(r"[^\w\s-]", "", clean_text.lower()).replace(" ", "-")
anchor_id = re.sub(r"-+", "-", anchor_id).strip("-") # Clean up multiple dashes
# Add id attribute, preserving any existing classes
if classes:
return f'<{tag} id="{anchor_id}"{classes}>{text}</{tag}>'
else:
return f'<{tag} id="{anchor_id}">{text}</{tag}>'
# Match h1-h6 tags with optional class attributes
return re.sub(
r"<(h[1-6])(\s+[^>]*)?>([^<]+)</h[1-6]>", replace_heading, html_content
)