mirror of
https://github.com/kennethreitz/kennethreitz.org.git
synced 2026-06-05 22:50:17 +00:00
088939c475
Updated get_directory_structure() to: - Check for index.md in directories - Extract H1 title from index.md when present - Use that title for generate_folder_icon() to create unique folder icons based on actual directory content - Maintain existing behavior for files This ensures directory listings show folder icons that reflect the actual directory title rather than just the folder name. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
540 lines
18 KiB
Python
540 lines
18 KiB
Python
"""Content processing utilities."""
|
|
|
|
import hashlib
|
|
import html
|
|
import re
|
|
from collections import Counter
|
|
from datetime import datetime
|
|
from functools import lru_cache
|
|
from pathlib import Path
|
|
|
|
from .svg_icons import generate_unique_svg_icon
|
|
|
|
|
|
TOKEN_PATTERN = re.compile(r"[a-z]{4,}")
|
|
STOPWORDS = {
|
|
"this",
|
|
"that",
|
|
"with",
|
|
"have",
|
|
"from",
|
|
"will",
|
|
"there",
|
|
"their",
|
|
"which",
|
|
"about",
|
|
"when",
|
|
"where",
|
|
"because",
|
|
"while",
|
|
"these",
|
|
"those",
|
|
"into",
|
|
"would",
|
|
"could",
|
|
"should",
|
|
"after",
|
|
"before",
|
|
"being",
|
|
"over",
|
|
"under",
|
|
"only",
|
|
"other",
|
|
"some",
|
|
"such",
|
|
"very",
|
|
"than",
|
|
"each",
|
|
"many",
|
|
"more",
|
|
"most",
|
|
"much",
|
|
"just",
|
|
"like",
|
|
"also",
|
|
"make",
|
|
"made",
|
|
"used",
|
|
"using",
|
|
"through",
|
|
"since",
|
|
"still",
|
|
"even",
|
|
"well",
|
|
"back",
|
|
"them",
|
|
"then",
|
|
"they",
|
|
"been",
|
|
"here",
|
|
"your",
|
|
"every",
|
|
"within",
|
|
"around",
|
|
"across",
|
|
}
|
|
|
|
|
|
@lru_cache(maxsize=1000)
|
|
def get_cached_markdown_title(file_path):
|
|
"""Extract and cache the H1 title from a markdown file for performance."""
|
|
try:
|
|
# Convert Path object to string for caching compatibility
|
|
file_path_str = str(file_path)
|
|
|
|
# Quick check - if file was modified recently, we might want to skip cache
|
|
# For now, let's just extract title efficiently
|
|
with open(file_path, "r", encoding="utf-8") as f:
|
|
content = f.read(1000) # Only read first 1000 chars to find title
|
|
|
|
# Look for first H1 markdown title
|
|
title_match = re.search(r"^#\s+(.+?)$", content, re.MULTILINE)
|
|
if title_match:
|
|
return title_match.group(1).strip()
|
|
|
|
# Fallback: look for HTML H1 if markdown was already rendered
|
|
title_match = re.search(r"<h1[^>]*>(.*?)</h1>", content, re.IGNORECASE)
|
|
if title_match:
|
|
# Remove HTML tags from title
|
|
title = re.sub(r"<[^>]+>", "", title_match.group(1))
|
|
return html.unescape(title).strip()
|
|
|
|
return None
|
|
except:
|
|
return None
|
|
|
|
|
|
@lru_cache(maxsize=500)
|
|
def generate_folder_icon(title, size=24):
|
|
"""Generate a folder icon with unique accent color based on title."""
|
|
hash_obj = hashlib.md5(title.encode())
|
|
hash_bytes = hash_obj.digest()
|
|
|
|
# Generate accent color
|
|
hue = (hash_bytes[0] * 360) // 256
|
|
saturation = 60 + (hash_bytes[1] * 20) // 256 # 60-80%
|
|
lightness = 45 + (hash_bytes[2] * 20) // 256 # 45-65%
|
|
|
|
accent_color = f"hsl({hue}, {saturation}%, {lightness}%)"
|
|
folder_base = "#e8e8e8"
|
|
|
|
svg = f"""<svg width="{size}" height="{size}" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg">
|
|
<defs>
|
|
<linearGradient id="folder_grad_{abs(hash(title)) % 1000}" x1="0%" y1="0%" x2="100%" y2="100%">
|
|
<stop offset="0%" stop-color="{folder_base}"/>
|
|
<stop offset="100%" stop-color="{accent_color}"/>
|
|
</linearGradient>
|
|
</defs>
|
|
<path d="M10 4H4c-1.11 0-2 .89-2 2v12c0 1.11.89 2 2 2h16c1.11 0 2-.89 2-2V8c0-1.11-.89-2-2-2h-8l-2-2z"
|
|
fill="url(#folder_grad_{abs(hash(title)) % 1000})"
|
|
stroke="{accent_color}"
|
|
stroke-width="0.5"/>
|
|
<circle cx="18" cy="7" r="2" fill="{accent_color}" opacity="0.8"/>
|
|
</svg>"""
|
|
|
|
import base64
|
|
|
|
svg_b64 = base64.b64encode(svg.encode()).decode()
|
|
return f"data:image/svg+xml;base64,{svg_b64}"
|
|
|
|
|
|
def get_directory_structure(path):
|
|
"""Get the directory structure for a given path."""
|
|
from pathlib import Path
|
|
|
|
DATA_DIR = Path("data")
|
|
items = []
|
|
if not path.exists() or not path.is_dir():
|
|
return items
|
|
|
|
# Separate directories and files for better organization
|
|
dirs = []
|
|
files = []
|
|
|
|
for item in sorted(path.iterdir(), reverse=True):
|
|
if (
|
|
item.name.startswith(".")
|
|
or item.name.lower() == "index.md"
|
|
or item.name.endswith(".bak")
|
|
):
|
|
continue
|
|
|
|
# Create display name without extension for files
|
|
display_name = item.stem if item.is_file() and item.suffix else item.name
|
|
display_name = display_name.replace("-", " ").replace("_", " ").title()
|
|
|
|
# Create clean URL path without .md extension
|
|
if item.is_dir():
|
|
url_path = "/" + str(item.relative_to(DATA_DIR)) + "/"
|
|
elif item.suffix == ".md":
|
|
# Remove .md extension for clean URLs
|
|
relative_path = str(item.relative_to(DATA_DIR))
|
|
url_path = "/" + relative_path[:-3] # Remove .md extension
|
|
else:
|
|
url_path = "/" + str(item.relative_to(DATA_DIR))
|
|
|
|
# Extract date from markdown files
|
|
file_date = None
|
|
if item.is_file() and item.suffix == ".md":
|
|
try:
|
|
with open(item, "r", encoding="utf-8") as f:
|
|
# Read first few lines to find date
|
|
for i, line in enumerate(f):
|
|
if i > 10: # Only check first 10 lines
|
|
break
|
|
# Look for date patterns like *January 2009* or *2014*
|
|
date_match = re.match(
|
|
r"^\*([A-Za-z]+ \d{4}|\d{4})\*\s*$", line.strip()
|
|
)
|
|
if date_match:
|
|
file_date = date_match.group(1)
|
|
break
|
|
except:
|
|
pass
|
|
|
|
# Generate unique SVG icon based on actual content title for consistency
|
|
icon_title = display_name # Default to filename-based display name
|
|
|
|
# For directories, try to get title from index.md
|
|
if item.is_dir():
|
|
try:
|
|
index_file = item / "index.md"
|
|
if index_file.exists():
|
|
cached_title = get_cached_markdown_title(index_file)
|
|
if cached_title:
|
|
icon_title = cached_title
|
|
# Also update display_name to use the actual title
|
|
display_name = cached_title
|
|
except:
|
|
# Fallback to filename-based display name if parsing fails
|
|
pass
|
|
|
|
# For markdown files, try to extract the actual H1 title from content
|
|
if item.is_file() and item.suffix == ".md":
|
|
try:
|
|
# Use cached title extraction for performance
|
|
cached_title = get_cached_markdown_title(item)
|
|
if cached_title:
|
|
icon_title = cached_title
|
|
# Also update display_name to use the actual title
|
|
display_name = cached_title
|
|
except:
|
|
# Fallback to filename-based display name if parsing fails
|
|
pass
|
|
|
|
if item.is_dir():
|
|
unique_icon = generate_folder_icon(icon_title, size=32)
|
|
else:
|
|
unique_icon = generate_unique_svg_icon(icon_title, size=32)
|
|
|
|
item_info = {
|
|
"name": item.name,
|
|
"display_name": display_name,
|
|
"path": str(item.relative_to(DATA_DIR)),
|
|
"url_path": url_path,
|
|
"is_dir": item.is_dir(),
|
|
"is_markdown": item.suffix == ".md",
|
|
"is_image": item.suffix.lower()
|
|
in [".jpg", ".jpeg", ".png", ".gif", ".webp"],
|
|
"size": item.stat().st_size if item.is_file() else None,
|
|
"created": datetime.fromtimestamp(item.stat().st_ctime),
|
|
"modified": datetime.fromtimestamp(item.stat().st_mtime),
|
|
"file_date": file_date, # Date extracted from file content
|
|
"file_type": item.suffix.lower() if item.is_file() else "directory",
|
|
"static_path": (
|
|
f"/static/data/{item.relative_to(DATA_DIR)}"
|
|
if not item.is_dir()
|
|
else None
|
|
),
|
|
"unique_icon": unique_icon, # Generated SVG icon
|
|
}
|
|
|
|
if item.is_dir():
|
|
dirs.append(item_info)
|
|
else:
|
|
files.append(item_info)
|
|
|
|
# Return directories first, then files
|
|
return dirs + files
|
|
|
|
|
|
def calculate_reading_time(text):
|
|
"""Calculate estimated reading time based on word count."""
|
|
# Remove HTML tags for more accurate word count
|
|
clean_text = re.sub(r"<[^>]+>", "", text)
|
|
# Average reading speed is 200-250 words per minute, using 225 as middle ground
|
|
word_count = len(clean_text.split())
|
|
reading_time = max(1, round(word_count / 225)) # Minimum 1 minute
|
|
return reading_time, word_count
|
|
|
|
|
|
def extract_tags_from_content(content, metadata, file_path):
|
|
"""Extract tags from content and metadata for categorization."""
|
|
tags = set()
|
|
|
|
# Only use explicitly defined tags from YAML front matter
|
|
if metadata.get("tags"):
|
|
if isinstance(metadata["tags"], list):
|
|
tags.update(tag.lower().strip() for tag in metadata["tags"])
|
|
else:
|
|
tags.update(tag.lower().strip() for tag in str(metadata["tags"]).split(","))
|
|
|
|
return list(tags)
|
|
|
|
|
|
def find_series_posts(metadata, current_path):
|
|
"""Find all posts in the same series as the current post."""
|
|
import os
|
|
import yaml
|
|
from pathlib import Path
|
|
|
|
DATA_DIR = Path("data")
|
|
series_posts = []
|
|
if not metadata.get("series"):
|
|
return series_posts
|
|
|
|
series_name = metadata["series"]
|
|
|
|
# Search through all markdown files to find posts in the same series
|
|
for root, dirs, files in os.walk(DATA_DIR):
|
|
for file in files:
|
|
if file.endswith(".md") and file != "index.md":
|
|
file_path = Path(root) / file
|
|
|
|
# Skip the current file
|
|
if str(file_path) == str(current_path):
|
|
continue
|
|
|
|
try:
|
|
with open(file_path, "r", encoding="utf-8") as f:
|
|
content = f.read()
|
|
|
|
# Extract metadata
|
|
yaml_pattern = r"^---\s*\n(.*?)\n---\s*\n"
|
|
yaml_match = re.match(yaml_pattern, content, re.DOTALL)
|
|
if yaml_match:
|
|
post_metadata = yaml.safe_load(yaml_match.group(1)) or {}
|
|
|
|
if post_metadata.get("series") == series_name:
|
|
# Create URL path for this post
|
|
relative_path = str(file_path.relative_to(DATA_DIR))
|
|
url_path = "/" + relative_path[:-3] # Remove .md
|
|
|
|
# Get title from metadata or filename
|
|
title = (
|
|
post_metadata.get("title")
|
|
or file_path.stem.replace("-", " ").title()
|
|
)
|
|
|
|
series_posts.append(
|
|
{
|
|
"title": title,
|
|
"url": url_path,
|
|
"order": post_metadata.get("series_order", 999),
|
|
"description": post_metadata.get("description", ""),
|
|
}
|
|
)
|
|
except:
|
|
continue
|
|
|
|
# Sort by series_order
|
|
series_posts.sort(key=lambda x: x["order"])
|
|
return series_posts
|
|
|
|
|
|
def extract_intelligent_date(item_path, content_data=None):
|
|
"""Extract date intelligently from various sources."""
|
|
from datetime import datetime
|
|
import re
|
|
|
|
# PRIORITY 1: Try to extract from filename first (most reliable)
|
|
filename = item_path.name
|
|
|
|
# Try full date format first (YYYY-MM-DD) - must have day followed by dash
|
|
date_match = re.match(r"^(\d{4})-(\d{2})-(\d{2})-", filename)
|
|
if date_match:
|
|
year, month, day = map(int, date_match.groups())
|
|
return datetime(year, month, day)
|
|
|
|
# Try year-month format (YYYY-MM) - must NOT have a day after month
|
|
date_match = re.match(r"^(\d{4})-(\d{2})-(?!\d{2})", filename)
|
|
if date_match:
|
|
year, month = map(int, date_match.groups())
|
|
return datetime(year, month, 1) # Default to first of month
|
|
|
|
# PRIORITY 2: Try to get date from metadata
|
|
if content_data and content_data.get("metadata", {}).get("date"):
|
|
date_str = content_data["metadata"]["date"]
|
|
try:
|
|
# Try various date formats
|
|
for fmt in ["%Y-%m-%d", "%B %Y", "%Y", "%B %d, %Y"]:
|
|
try:
|
|
return datetime.strptime(str(date_str), fmt)
|
|
except ValueError:
|
|
continue
|
|
except:
|
|
pass
|
|
|
|
# PRIORITY 3: Fall back to file modification time
|
|
try:
|
|
return datetime.fromtimestamp(item_path.stat().st_mtime)
|
|
except:
|
|
return None
|
|
|
|
|
|
def find_related_posts(current_post_path, limit=3):
|
|
"""Find related posts based on tags and content similarity."""
|
|
from ..core.cache import get_blog_cache
|
|
|
|
blog_data = get_blog_cache() or {}
|
|
posts = blog_data.get("posts", [])
|
|
if not posts:
|
|
return []
|
|
|
|
def normalize(path_value):
|
|
path_str = str(path_value).replace("\\", "/")
|
|
return path_str[2:] if path_str.startswith("./") else path_str
|
|
|
|
post_lookup = {}
|
|
for post in posts:
|
|
file_path = post.get("file_path")
|
|
if not file_path:
|
|
continue
|
|
post_lookup[normalize(file_path)] = post
|
|
|
|
current_key = normalize(Path(current_post_path))
|
|
current_post = post_lookup.get(current_key)
|
|
if not current_post:
|
|
return []
|
|
|
|
@lru_cache(maxsize=512)
|
|
def get_terms(post_key):
|
|
post = post_lookup.get(post_key)
|
|
if not post:
|
|
return {}, set()
|
|
|
|
content = (post.get("content") or "").lower()
|
|
words = [w for w in TOKEN_PATTERN.findall(content) if w not in STOPWORDS]
|
|
if not words:
|
|
return {}, set()
|
|
|
|
top_terms = dict(Counter(words).most_common(40))
|
|
title_terms = {
|
|
w
|
|
for w in TOKEN_PATTERN.findall((post.get("title") or "").lower())
|
|
if w not in STOPWORDS
|
|
}
|
|
return top_terms, title_terms
|
|
|
|
def fallback_posts():
|
|
fallback = [post for key, post in post_lookup.items() if key != current_key]
|
|
fallback.sort(key=lambda p: (p.get("pub_date") or datetime.min), reverse=True)
|
|
return [compact(post) for post in fallback[:limit]]
|
|
|
|
current_terms, current_title_terms = get_terms(current_key)
|
|
if not current_terms and not current_title_terms:
|
|
return fallback_posts()
|
|
|
|
current_weight = sum(current_terms.values()) or 1
|
|
current_date = current_post.get("pub_date")
|
|
|
|
def compact(post):
|
|
return {
|
|
"title": post.get("title"),
|
|
"url": post.get("url") or post.get("path"),
|
|
"description": post.get("description"),
|
|
"pub_date": post.get("pub_date"),
|
|
"date_str": post.get("date_str"),
|
|
"unique_icon": post.get("unique_icon"),
|
|
}
|
|
|
|
scored_posts = []
|
|
for key, candidate in post_lookup.items():
|
|
if key == current_key:
|
|
continue
|
|
|
|
candidate_terms, candidate_title_terms = get_terms(key)
|
|
|
|
shared_terms = set(current_terms).intersection(candidate_terms)
|
|
shared_weight = sum(
|
|
min(current_terms[word], candidate_terms[word]) for word in shared_terms
|
|
)
|
|
|
|
normalization = 0
|
|
if shared_weight:
|
|
candidate_weight = sum(candidate_terms.values()) or 1
|
|
normalization = (shared_weight * 100) // (current_weight + candidate_weight)
|
|
|
|
title_overlap = len(current_title_terms & candidate_title_terms)
|
|
|
|
date_bonus = 0
|
|
candidate_date = candidate.get("pub_date")
|
|
if current_date and candidate_date:
|
|
month_delta = abs(
|
|
(current_date.year - candidate_date.year) * 12
|
|
+ (current_date.month - candidate_date.month)
|
|
)
|
|
if month_delta < 24:
|
|
date_bonus = 24 - month_delta
|
|
|
|
score = shared_weight * 10 + normalization * 2 + title_overlap * 5 + date_bonus
|
|
|
|
if score:
|
|
scored_posts.append((score, candidate))
|
|
|
|
if not scored_posts:
|
|
return fallback_posts()
|
|
|
|
scored_posts.sort(
|
|
key=lambda item: (
|
|
item[0],
|
|
item[1].get("pub_date") or datetime.min,
|
|
),
|
|
reverse=True,
|
|
)
|
|
|
|
return [compact(post) for score, post in scored_posts[:limit]]
|
|
|
|
|
|
def find_adjacent_posts(current_post_path):
|
|
"""Find previous and next posts in chronological order."""
|
|
from ..core.cache import get_blog_cache
|
|
|
|
blog_data = get_blog_cache() or {}
|
|
posts = blog_data.get("posts", [])
|
|
if not posts:
|
|
return {"prev": None, "next": None}
|
|
|
|
def normalize(path_value):
|
|
path_str = str(path_value).replace("\\", "/")
|
|
return path_str[2:] if path_str.startswith("./") else path_str
|
|
|
|
current_key = normalize(Path(current_post_path))
|
|
|
|
current_index = None
|
|
for idx, post in enumerate(posts):
|
|
file_path = post.get("file_path")
|
|
if file_path and normalize(file_path) == current_key:
|
|
current_index = idx
|
|
break
|
|
|
|
if current_index is None:
|
|
return {"prev": None, "next": None}
|
|
|
|
def compact_post(post):
|
|
if not post:
|
|
return None
|
|
return {
|
|
"title": post.get("title"),
|
|
"url": post.get("url") or post.get("path"),
|
|
"pub_date": post.get("pub_date"),
|
|
"date_str": post.get("date_str"),
|
|
"description": post.get("description"),
|
|
"unique_icon": post.get("unique_icon"),
|
|
}
|
|
|
|
prev_post = posts[current_index + 1] if current_index + 1 < len(posts) else None
|
|
next_post = posts[current_index - 1] if current_index - 1 >= 0 else None
|
|
|
|
return {"prev": compact_post(prev_post), "next": compact_post(next_post)}
|