Files
kennethreitz.org/tuftecms/utils/content.py
T
kennethreitz 088939c475 Extract titles from index.md for folder icon generation
Updated get_directory_structure() to:
- Check for index.md in directories
- Extract H1 title from index.md when present
- Use that title for generate_folder_icon() to create unique folder
  icons based on actual directory content
- Maintain existing behavior for files

This ensures directory listings show folder icons that reflect the
actual directory title rather than just the folder name.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-01 15:12:54 -04:00

540 lines
18 KiB
Python

"""Content processing utilities."""
import hashlib
import html
import re
from collections import Counter
from datetime import datetime
from functools import lru_cache
from pathlib import Path
from .svg_icons import generate_unique_svg_icon
TOKEN_PATTERN = re.compile(r"[a-z]{4,}")
STOPWORDS = {
"this",
"that",
"with",
"have",
"from",
"will",
"there",
"their",
"which",
"about",
"when",
"where",
"because",
"while",
"these",
"those",
"into",
"would",
"could",
"should",
"after",
"before",
"being",
"over",
"under",
"only",
"other",
"some",
"such",
"very",
"than",
"each",
"many",
"more",
"most",
"much",
"just",
"like",
"also",
"make",
"made",
"used",
"using",
"through",
"since",
"still",
"even",
"well",
"back",
"them",
"then",
"they",
"been",
"here",
"your",
"every",
"within",
"around",
"across",
}
@lru_cache(maxsize=1000)
def get_cached_markdown_title(file_path):
    """Extract and cache the H1 title from a markdown file.

    Only the first 1000 characters of the file are inspected. A markdown
    heading (``# Title``) is preferred; if none is found, the first
    ``<h1>...</h1>`` element is used with nested tags removed and HTML
    entities unescaped.

    Results are memoized per ``file_path`` by ``lru_cache``, so a title that
    changes on disk is not picked up until the cache entry is evicted or
    the cache is cleared.

    Args:
        file_path: Hashable path (``str`` or ``Path``) of the file to read.

    Returns:
        The stripped title text, or ``None`` when no title is found or the
        file cannot be read or decoded as UTF-8.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            head = f.read(1000)  # the title is expected near the top
    except (OSError, ValueError):
        # Missing/unreadable file, or bytes that are not valid UTF-8
        # (UnicodeDecodeError is a ValueError subclass).
        return None

    # Require the title text on the same line as the "#". Using "\s+" here
    # would let an empty "#" heading swallow the newline and report the
    # following line as the title.
    title_match = re.search(r"^#[ \t]+(\S.*)$", head, re.MULTILINE)
    if title_match:
        return title_match.group(1).strip()

    # Fallback: an HTML <h1>, e.g. when the content is already rendered.
    title_match = re.search(r"<h1[^>]*>(.*?)</h1>", head, re.IGNORECASE)
    if title_match:
        inner_text = re.sub(r"<[^>]+>", "", title_match.group(1))
        return html.unescape(inner_text).strip()

    return None
@lru_cache(maxsize=500)
def generate_folder_icon(title, size=24):
    """Generate a folder icon whose accent color is derived from ``title``.

    The color and the SVG gradient id are both taken from the MD5 digest of
    the title, so the same title yields the same icon in every process.
    (The built-in ``hash()`` is salted per interpreter run for strings and
    is therefore unsuitable for a stable id.)

    Args:
        title: Text that seeds the icon; encoded with the default UTF-8.
        size: Value used for the SVG ``width`` and ``height`` attributes.

    Returns:
        A ``data:image/svg+xml;base64,...`` URI containing the icon.
    """
    import base64

    digest = hashlib.md5(title.encode()).digest()

    # Map the first three digest bytes onto an HSL accent color.
    hue = (digest[0] * 360) // 256  # 0-358
    saturation = 60 + (digest[1] * 20) // 256  # 60-79%
    lightness = 45 + (digest[2] * 20) // 256  # 45-64%
    accent_color = f"hsl({hue}, {saturation}%, {lightness}%)"
    folder_base = "#e8e8e8"

    # Stable gradient id in the same 0-999 range the hash()-based id used.
    gradient_id = f"folder_grad_{int.from_bytes(digest[3:5], 'big') % 1000}"

    svg = f"""<svg width="{size}" height="{size}" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg">
<defs>
<linearGradient id="{gradient_id}" x1="0%" y1="0%" x2="100%" y2="100%">
<stop offset="0%" stop-color="{folder_base}"/>
<stop offset="100%" stop-color="{accent_color}"/>
</linearGradient>
</defs>
<path d="M10 4H4c-1.11 0-2 .89-2 2v12c0 1.11.89 2 2 2h16c1.11 0 2-.89 2-2V8c0-1.11-.89-2-2-2h-8l-2-2z"
fill="url(#{gradient_id})"
stroke="{accent_color}"
stroke-width="0.5"/>
<circle cx="18" cy="7" r="2" fill="{accent_color}" opacity="0.8"/>
</svg>"""

    svg_b64 = base64.b64encode(svg.encode()).decode()
    return f"data:image/svg+xml;base64,{svg_b64}"
# Matches a whole line such as "*January 2009*" or "*2014*".
_FILE_DATE_PATTERN = re.compile(r"^\*([A-Za-z]+ \d{4}|\d{4})\*\s*$")


def _extract_file_date(md_path):
    """Return the italic date text found near the top of a file, or None."""
    try:
        with open(md_path, "r", encoding="utf-8") as f:
            for line_index, line in enumerate(f):
                if line_index > 10:  # lines 0-10, i.e. the first 11 lines
                    break
                date_match = _FILE_DATE_PATTERN.match(line.strip())
                if date_match:
                    return date_match.group(1)
    except (OSError, ValueError):
        # Unreadable file or invalid UTF-8: the date is simply unknown.
        pass
    return None


def get_directory_structure(path):
    """Get the directory structure for a given path.

    Entries whose name starts with ".", equals "index.md" (case-insensitive)
    or ends with ".bak" are skipped, as are entries that cannot be
    ``stat``-ed (for example dangling symlinks).

    Args:
        path: ``Path`` of the directory to list. Its entries are expressed
            relative to ``Path("data")``; ``Path.relative_to`` raises
            ``ValueError`` for entries outside that directory.

    Returns:
        A list of dicts, directories first and then files, each group in
        reverse name order. Every dict has the keys ``name``,
        ``display_name``, ``path``, ``url_path``, ``is_dir``,
        ``is_markdown``, ``is_image``, ``size``, ``created``, ``modified``,
        ``file_date``, ``file_type``, ``static_path`` and ``unique_icon``.
        An empty list is returned when ``path`` is missing or is not a
        directory.
    """
    DATA_DIR = Path("data")

    if not path.exists() or not path.is_dir():
        return []

    dirs = []
    files = []

    for item in sorted(path.iterdir(), reverse=True):
        name = item.name
        if name.startswith(".") or name.lower() == "index.md" or name.endswith(".bak"):
            continue

        try:
            item_stat = item.stat()  # one stat call serves size/ctime/mtime
        except OSError:
            # e.g. a dangling symlink; skip it rather than fail the listing
            continue

        is_dir = item.is_dir()
        is_file = item.is_file()
        suffix = item.suffix
        is_markdown = suffix == ".md"
        relative = item.relative_to(DATA_DIR)
        # URLs always use forward slashes, whatever the OS path separator is.
        relative_url = relative.as_posix()

        # Human-readable fallback name derived from the file/folder name.
        display_name = item.stem if is_file and suffix else name
        display_name = display_name.replace("-", " ").replace("_", " ").title()

        if is_dir:
            url_path = "/" + relative_url + "/"
        elif is_markdown:
            url_path = "/" + relative_url[:-3]  # drop ".md" for clean URLs
        else:
            url_path = "/" + relative_url

        file_date = _extract_file_date(item) if is_file and is_markdown else None

        # Prefer the real H1 title (a directory uses its index.md) for both
        # the display name and the icon seed.
        title_source = None
        if is_dir:
            title_source = item / "index.md"
        elif is_file and is_markdown:
            title_source = item

        icon_title = display_name
        if title_source is not None:
            try:
                if title_source == item or title_source.exists():
                    cached_title = get_cached_markdown_title(title_source)
                    if cached_title:
                        icon_title = cached_title
                        display_name = cached_title
            except (OSError, ValueError):
                # Keep the filename-based name when the title lookup fails.
                pass

        if is_dir:
            unique_icon = generate_folder_icon(icon_title, size=32)
        else:
            unique_icon = generate_unique_svg_icon(icon_title, size=32)

        item_info = {
            "name": name,
            "display_name": display_name,
            "path": str(relative),
            "url_path": url_path,
            "is_dir": is_dir,
            "is_markdown": is_markdown,
            "is_image": suffix.lower() in [".jpg", ".jpeg", ".png", ".gif", ".webp"],
            "size": item_stat.st_size if is_file else None,
            "created": datetime.fromtimestamp(item_stat.st_ctime),
            "modified": datetime.fromtimestamp(item_stat.st_mtime),
            "file_date": file_date,  # Date extracted from file content
            "file_type": suffix.lower() if is_file else "directory",
            "static_path": f"/static/data/{relative_url}" if not is_dir else None,
            "unique_icon": unique_icon,  # Generated SVG icon
        }

        if is_dir:
            dirs.append(item_info)
        else:
            files.append(item_info)

    # Return directories first, then files
    return dirs + files
def calculate_reading_time(text):
    """Estimate how long ``text`` takes to read.

    HTML tags are removed before counting whitespace-separated words, and
    the estimate assumes 225 words per minute.

    Returns:
        A ``(minutes, word_count)`` tuple; ``minutes`` is never below 1.
    """
    words = re.sub(r"<[^>]+>", "", text).split()
    total_words = len(words)

    minutes = round(total_words / 225)
    if minutes < 1:
        minutes = 1  # even very short texts report one minute

    return minutes, total_words
def extract_tags_from_content(content, metadata, file_path):
    """Extract tags from metadata for categorization.

    Only tags declared explicitly under the ``"tags"`` metadata key are
    used; ``content`` and ``file_path`` are accepted for interface
    compatibility but are not inspected.

    Args:
        content: Unused.
        metadata: Mapping that may contain ``"tags"`` as a collection of
            values or as a single comma-separated string. ``None`` is
            treated as empty metadata.
        file_path: Unused.

    Returns:
        A sorted list of unique, lower-cased, stripped tag strings. Blank
        tags and ``None`` entries are dropped; non-string values (for
        example numbers) are converted with ``str``.
    """
    raw_tags = (metadata or {}).get("tags")
    if not raw_tags:
        return []

    if isinstance(raw_tags, (list, tuple, set, frozenset)):
        candidates = raw_tags
    else:
        candidates = str(raw_tags).split(",")

    tags = {str(tag).lower().strip() for tag in candidates if tag is not None}
    tags.discard("")  # produced by stray commas such as "a,,b"

    # Sorted so the result order does not vary between runs.
    return sorted(tags)
def find_series_posts(metadata, current_path):
    """Find all other posts in the same series as the current post.

    Every ``.md`` file under ``data`` (except files named ``index.md`` and
    the current post itself) is read, its leading ``---`` delimited YAML
    front matter is parsed, and it is kept when its ``series`` value equals
    the current post's. Files that cannot be read, decoded or parsed are
    skipped.

    Args:
        metadata: Front matter of the current post; the ``"series"`` key
            names the series.
        current_path: Path of the current post, compared by its string
            form against the paths produced by walking ``data``.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``order`` and
        ``description``, sorted by ``order`` (the raw ``series_order``
        value, defaulting to 999). Empty when the post has no series.
    """
    if not metadata.get("series"):
        return []

    import os

    import yaml

    DATA_DIR = Path("data")
    series_name = metadata["series"]
    current = str(current_path)
    front_matter_pattern = re.compile(r"^---\s*\n(.*?)\n---\s*\n", re.DOTALL)

    series_posts = []
    for root, _dirs, files in os.walk(DATA_DIR):
        for file_name in files:
            if not file_name.endswith(".md") or file_name == "index.md":
                continue

            file_path = Path(root) / file_name
            if str(file_path) == current:
                continue  # the current post is not listed in its own series

            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()
            except (OSError, UnicodeDecodeError):
                continue

            yaml_match = front_matter_pattern.match(content)
            if not yaml_match:
                continue

            try:
                post_metadata = yaml.safe_load(yaml_match.group(1)) or {}
            except yaml.YAMLError:
                continue
            if not isinstance(post_metadata, dict):
                continue  # front matter that is a bare list or scalar
            if post_metadata.get("series") != series_name:
                continue

            # Forward slashes regardless of the OS path separator.
            relative_path = file_path.relative_to(DATA_DIR).as_posix()
            title = (
                post_metadata.get("title")
                or file_path.stem.replace("-", " ").title()
            )
            series_posts.append(
                {
                    "title": title,
                    "url": "/" + relative_path[:-3],  # Remove .md
                    "order": post_metadata.get("series_order", 999),
                    "description": post_metadata.get("description", ""),
                }
            )

    def order_key(post):
        # A series_order of mixed or non-numeric type (e.g. "2", None)
        # would make a plain sort raise TypeError; compare as floats and
        # send unusable values to the default position.
        try:
            return float(post["order"])
        except (TypeError, ValueError):
            return 999.0

    series_posts.sort(key=order_key)
    return series_posts
def extract_intelligent_date(item_path, content_data=None):
    """Extract a date for an item, trying several sources in priority order.

    1. The filename: a ``YYYY-MM-DD-`` prefix, then a ``YYYY-MM-`` prefix
       (which maps to the first day of that month).
    2. ``content_data["metadata"]["date"]``, parsed with the formats
       ``%Y-%m-%d``, ``%B %Y``, ``%Y`` and ``%B %d, %Y``.
    3. The file's modification time.

    A source that does not yield a valid calendar date (for example a
    filename starting with ``2020-13-45-``) is skipped and the next source
    is tried.

    Args:
        item_path: ``Path`` of the item.
        content_data: Optional mapping with a ``"metadata"`` mapping.

    Returns:
        A ``datetime``, or ``None`` when no source produced one.
    """
    filename = item_path.name

    # PRIORITY 1: filename prefix. The full date must be followed by a dash.
    full_match = re.match(r"^(\d{4})-(\d{2})-(\d{2})-", filename)
    if full_match:
        year, month, day = map(int, full_match.groups())
        try:
            return datetime(year, month, day)
        except ValueError:
            pass  # digits that are not a real date; try the next source

    # Year-month prefix that is NOT followed by a two-digit day.
    month_match = re.match(r"^(\d{4})-(\d{2})-(?!\d{2})", filename)
    if month_match:
        year, month = map(int, month_match.groups())
        try:
            return datetime(year, month, 1)  # Default to first of month
        except ValueError:
            pass

    # PRIORITY 2: date from metadata.
    metadata = (content_data or {}).get("metadata") or {}
    date_value = metadata.get("date")
    if date_value:
        date_text = str(date_value)
        for fmt in ("%Y-%m-%d", "%B %Y", "%Y", "%B %d, %Y"):
            try:
                return datetime.strptime(date_text, fmt)
            except ValueError:
                continue

    # PRIORITY 3: file modification time.
    try:
        return datetime.fromtimestamp(item_path.stat().st_mtime)
    except (OSError, OverflowError, ValueError):
        return None
def find_related_posts(current_post_path, limit=3):
    """Find posts related to the current one by vocabulary, title and date.

    Posts come from ``get_blog_cache()``. Each candidate is scored from:

    * the overlap between its 40 most common content terms and the current
      post's (tokens from ``TOKEN_PATTERN`` minus ``STOPWORDS``),
    * the number of title terms it shares with the current post, and
    * a bonus for being published within 24 months of the current post.

    When the current post has no usable terms, or no candidate scores above
    zero, the most recently published other posts are returned instead.

    Args:
        current_post_path: Path of the current post; matched against each
            cached post's ``file_path`` after normalizing separators and a
            leading ``./``.
        limit: Maximum number of posts to return.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``description``,
        ``pub_date``, ``date_str`` and ``unique_icon``. Empty when the
        cache has no posts or the current post is not in it.
    """
    from ..core.cache import get_blog_cache

    blog_data = get_blog_cache() or {}
    posts = blog_data.get("posts", [])
    if not posts:
        return []

    def normalize(path_value):
        path_str = str(path_value).replace("\\", "/")
        return path_str[2:] if path_str.startswith("./") else path_str

    post_lookup = {}
    for post in posts:
        file_path = post.get("file_path")
        if not file_path:
            continue
        post_lookup[normalize(file_path)] = post

    current_key = normalize(Path(current_post_path))
    current_post = post_lookup.get(current_key)
    if not current_post:
        return []

    # Defined before fallback_posts() can run: when this helper was bound
    # after the first fallback call, that call raised NameError.
    def compact(post):
        return {
            "title": post.get("title"),
            "url": post.get("url") or post.get("path"),
            "description": post.get("description"),
            "pub_date": post.get("pub_date"),
            "date_str": post.get("date_str"),
            "unique_icon": post.get("unique_icon"),
        }

    # Each key is looked up exactly once per call, so no memoization is used.
    def get_terms(post_key):
        post = post_lookup.get(post_key)
        if not post:
            return {}, set()
        content = (post.get("content") or "").lower()
        words = [w for w in TOKEN_PATTERN.findall(content) if w not in STOPWORDS]
        if not words:
            return {}, set()
        top_terms = dict(Counter(words).most_common(40))
        title_terms = {
            w
            for w in TOKEN_PATTERN.findall((post.get("title") or "").lower())
            if w not in STOPWORDS
        }
        return top_terms, title_terms

    def fallback_posts():
        fallback = [post for key, post in post_lookup.items() if key != current_key]
        fallback.sort(key=lambda p: (p.get("pub_date") or datetime.min), reverse=True)
        return [compact(post) for post in fallback[:limit]]

    current_terms, current_title_terms = get_terms(current_key)
    if not current_terms and not current_title_terms:
        return fallback_posts()

    current_weight = sum(current_terms.values()) or 1
    # NOTE(review): pub_date is assumed to expose .year/.month (datetime-like)
    # - confirm against the cache builder.
    current_date = current_post.get("pub_date")

    scored_posts = []
    for key, candidate in post_lookup.items():
        if key == current_key:
            continue

        candidate_terms, candidate_title_terms = get_terms(key)
        shared_terms = set(current_terms).intersection(candidate_terms)
        shared_weight = sum(
            min(current_terms[word], candidate_terms[word]) for word in shared_terms
        )

        # Shared weight as an integer percentage of the combined term weight.
        normalization = 0
        if shared_weight:
            candidate_weight = sum(candidate_terms.values()) or 1
            normalization = (shared_weight * 100) // (current_weight + candidate_weight)

        title_overlap = len(current_title_terms & candidate_title_terms)

        # Up to 24 points for posts published in the same month, decreasing
        # by one per month of distance.
        date_bonus = 0
        candidate_date = candidate.get("pub_date")
        if current_date and candidate_date:
            month_delta = abs(
                (current_date.year - candidate_date.year) * 12
                + (current_date.month - candidate_date.month)
            )
            if month_delta < 24:
                date_bonus = 24 - month_delta

        score = shared_weight * 10 + normalization * 2 + title_overlap * 5 + date_bonus
        if score:
            scored_posts.append((score, candidate))

    if not scored_posts:
        return fallback_posts()

    # Highest score first; ties go to the more recent post.
    scored_posts.sort(
        key=lambda item: (
            item[0],
            item[1].get("pub_date") or datetime.min,
        ),
        reverse=True,
    )
    return [compact(post) for score, post in scored_posts[:limit]]
def find_adjacent_posts(current_post_path):
    """Locate the posts on either side of the current one in the blog cache.

    The current post is found by comparing normalized ``file_path`` values.
    ``"prev"`` is the entry that follows it in the cached list and
    ``"next"`` is the entry that precedes it.
    NOTE(review): this presumably relies on the cache being ordered
    newest-first - confirm against the cache builder.

    Returns:
        ``{"prev": ..., "next": ...}`` where each value is a summary dict
        or ``None`` (also when the post or the cache is missing).
    """
    from ..core.cache import get_blog_cache

    def to_key(value):
        # Unify separators and drop a leading "./" so paths compare equal.
        text = str(value).replace("\\", "/")
        if text.startswith("./"):
            return text[2:]
        return text

    def summarize(entry):
        if not entry:
            return None
        return {
            "title": entry.get("title"),
            "url": entry.get("url") or entry.get("path"),
            "pub_date": entry.get("pub_date"),
            "date_str": entry.get("date_str"),
            "description": entry.get("description"),
            "unique_icon": entry.get("unique_icon"),
        }

    cache = get_blog_cache() or {}
    all_posts = cache.get("posts", [])
    if not all_posts:
        return {"prev": None, "next": None}

    wanted = to_key(Path(current_post_path))
    position = next(
        (
            index
            for index, entry in enumerate(all_posts)
            if entry.get("file_path") and to_key(entry.get("file_path")) == wanted
        ),
        None,
    )
    if position is None:
        return {"prev": None, "next": None}

    following = all_posts[position + 1] if position + 1 < len(all_posts) else None
    preceding = all_posts[position - 1] if position >= 1 else None
    return {"prev": summarize(following), "next": summarize(preceding)}