mirror of
https://github.com/kennethreitz/kennethreitz.org.git
synced 2026-06-05 22:50:17 +00:00
a353ccfaa7
- Detect directory paths (ending with / or pointing to directories) - Use generate_folder_icon() for directories instead of regular article icons - Read index.md files for directory titles when available - Fallback to directory name for folder icon generation - Fixes AI personalities link and other directory links to show proper folder icons 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
3815 lines
159 KiB
Python
3815 lines
159 KiB
Python
# Enable gevent async I/O optimizations
|
|
import gevent
|
|
from gevent import monkey
|
|
monkey.patch_all() # Patch standard library for async I/O
|
|
from gevent.pool import Pool
|
|
|
|
import os
|
|
import mistune
|
|
from flask import Flask, render_template, abort, request, url_for, jsonify, redirect, Response
|
|
from pathlib import Path
|
|
import re
|
|
from datetime import datetime
|
|
from urllib.parse import quote
|
|
import json
|
|
import time
|
|
from xml.sax.saxutils import escape
|
|
import html
|
|
from collections import defaultdict
|
|
import hashlib
|
|
import base64
|
|
import math
|
|
from functools import lru_cache
|
|
|
|
# The Flask application object; templates are loaded from ./templates.
app = Flask(__name__, template_folder='templates')

# Configuration
# Analytics are switched off only when DISABLE_ANALYTICS is the string
# "true" (compared case-insensitively); unset or any other value leaves
# them enabled.
app.config['DISABLE_ANALYTICS'] = (
    os.environ.get('DISABLE_ANALYTICS', 'false').lower() == 'true'
)
|
|
|
|
# Add custom Jinja2 filters
|
|
@app.template_filter('strftime')
def strftime_filter(date, fmt='%Y-%m-%d'):
    """Jinja2 filter: render ``date`` through ``strftime`` using ``fmt``.

    ``None`` renders as an empty string, and the string ``'now'`` (in any
    letter case) stands for the current local time.
    """
    if date is None:
        return ''
    wants_now = isinstance(date, str) and date.lower() == 'now'
    moment = datetime.now() if wants_now else date
    return moment.strftime(fmt)
|
|
|
|
@app.template_filter('unescape')
def unescape_filter(text):
    """Jinja2 filter: decode HTML entities in ``text``; ``None`` becomes ''."""
    return '' if text is None else html.unescape(text)
|
|
|
|
def _process_single_file(file_path):
|
|
"""Process a single file for cache generation. Returns data structure for the file."""
|
|
try:
|
|
full_path = Path(file_path)
|
|
|
|
# Read raw content directly for processing
|
|
with open(full_path, 'r', encoding='utf-8') as f:
|
|
raw_content = f.read()
|
|
|
|
# Get processed content data
|
|
content_data = render_markdown_file(full_path)
|
|
html_content = content_data['content']
|
|
|
|
result = {
|
|
'file_path': file_path,
|
|
'full_path': full_path,
|
|
'raw_content': raw_content,
|
|
'content_data': content_data,
|
|
'html_content': html_content,
|
|
'success': True
|
|
}
|
|
|
|
return result
|
|
except Exception as e:
|
|
return {
|
|
'file_path': file_path,
|
|
'error': str(e),
|
|
'success': False
|
|
}
|
|
|
|
|
|
def _generate_all_caches_unified():
    """Generate all caches in a single sweep through the data.

    Every ``data/**/*.md`` file except directory ``index.md`` pages is
    rendered through ``_process_single_file`` on a gevent pool and then mined
    for sidenotes, headings, blockquotes, internal links and capitalized
    terms. Files directly inside an ``essays`` directory additionally become
    blog-post entries when ``extract_intelligent_date`` yields a date.

    Returns:
        dict with the keys ``blog_posts`` (list, newest first), ``sidenotes``,
        ``outlines``, ``quotes``, ``connections`` and ``terms``; each of the
        latter five is a dict holding the collected data plus a total.
    """
    import glob
    from collections import Counter, defaultdict
    import re

    def simple_extract_excerpt(content, max_words=50):
        """Simple excerpt extraction for unified cache generation."""
        # Remove front matter
        content = re.sub(r'^---\s*\n.*?\n---\s*\n', '', content, flags=re.DOTALL)
        # Remove title (first # line)
        content = re.sub(r'^# .+?$', '', content, flags=re.MULTILINE)
        # Remove date lines
        content = re.sub(r'^\*[A-Za-z]+ \d{4}\*\s*$', '', content, flags=re.MULTILINE)
        # Remove linked images (an image wrapped in a link)
        content = re.sub(r'\[!\[[^\]]*\]\([^)]*\)\]\([^)]*\)', '', content)
        # Remove standalone images
        content = re.sub(r'!\[[^\]]*\]\([^)]*\)', '', content)
        # Remove image references
        content = re.sub(r'\[Image #\d+\]', '', content)
        # Remove sidenotes (label + input + span structure)
        content = re.sub(r'<label[^>]*class="margin-toggle sidenote-number"[^>]*></label><input[^>]*class="margin-toggle"[^>]*/>(<span class="sidenote">.*?</span>)', '', content, flags=re.DOTALL)
        # Remove any remaining HTML tags
        content = re.sub(r'<[^>]+>', '', content)
        # Remove markdown links but keep the text
        content = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', content)
        # Remove markdown emphasis
        content = re.sub(r'[*_`]', '', content)

        # First meaningful line: non-empty, not a header, longer than 10 chars.
        first_para = None
        for line in (raw_line.strip() for raw_line in content.split('\n')):
            if not line or re.match(r'^#{1,6}\s', line):
                continue
            if len(line) > 10:
                first_para = line
                break

        if not first_para:
            return ''
        words = first_para.split()[:max_words]
        excerpt = ' '.join(words)
        if len(words) == max_words:
            excerpt += '...'
        return excerpt

    # Patterns are compiled once here instead of once per processed file.
    # Sidenote structure: <input id=...> followed by <span class="sidenote">.
    sidenote_re = re.compile(
        r'<input[^>]*id="([^"]*)"[^>]*class="margin-toggle"[^>]*/>.*?<span class="sidenote">(.*?)</span>',
        re.DOTALL,
    )
    # Groups: whole tag, heading level, inner HTML.
    heading_re = re.compile(r'(<h([1-6])[^>]*>(.*?)</h[1-6]>)')
    quote_re = re.compile(r'<blockquote[^>]*>(.*?)</blockquote>', re.DOTALL)
    # Markdown links whose target starts with "/" (site-internal).
    connection_re = re.compile(r'\[([^\]]+)\]\((/[^)]+)\)')
    term_re = re.compile(r'\b[A-Z][a-zA-Z]{3,}\b')
    tag_re = re.compile(r'<[^>]+>')
    ignored_terms = {'This', 'That', 'They', 'When', 'Where', 'What', 'Which'}

    # Initialize all data structures
    sidenotes_data = defaultdict(list)
    outlines_data = defaultdict(list)
    quotes_data = defaultdict(list)
    connections_outgoing = defaultdict(list)
    connections_incoming = defaultdict(list)
    terms_data = defaultdict(list)
    blog_posts = []

    # Get all markdown files from the data/ directory. Compare the basename so
    # that only real index pages are skipped (endswith() would also drop a
    # file such as "reindex.md").
    all_files = [
        f for f in glob.glob('data/**/*.md', recursive=True)
        if Path(f).name != 'index.md'
    ]

    print(f"Unified cache generation: Processing {len(all_files)} files...")

    # Use gevent pool for parallel file processing
    pool = Pool(20)  # Process up to 20 files concurrently
    file_results = pool.map(_process_single_file, all_files)

    for result in file_results:
        if not result['success']:
            print(f"Error processing {result['file_path']}: {result['error']}")
            continue

        file_path = result['file_path']
        full_path = result['full_path']
        raw_content = result['raw_content']
        content_data = result['content_data']
        html_content = result['html_content']

        # Generate blog post entry if this is an essay
        if full_path.parent.name == 'essays':
            date_obj = extract_intelligent_date(full_path, content_data)

            if date_obj is not None:
                # as_posix() keeps "/" separators in the URL on every platform.
                url = f"/{full_path.relative_to(Path('data')).with_suffix('').as_posix()}"
                excerpt = simple_extract_excerpt(raw_content)
                blog_posts.append({
                    'title': content_data['title'],
                    'path': url,
                    'url': url,
                    'file_path': str(full_path),  # Actual file path for mapping
                    'pub_date': date_obj,
                    'date_str': date_obj.strftime('%Y-%m-%d'),
                    'excerpt': excerpt,
                    'description': excerpt,
                    'word_count': len(raw_content.split()),
                    'category': full_path.parent.name,
                    'unique_icon': generate_unique_svg_icon(content_data['title'], size=24),
                })
            else:
                print(f"DEBUG: Could not extract date from {full_path.name} in unified cache")

        # Extract sidenotes with their IDs
        for sidenote_id, sidenote_content in sidenote_re.findall(html_content):
            clean_sidenote = tag_re.sub('', sidenote_content).strip()
            if clean_sidenote:
                sidenotes_data[file_path].append({
                    'text': clean_sidenote,
                    'html': sidenote_content.strip(),
                    'id': sidenote_id,
                })

        # Extract outlines (headings)
        for full_tag, level, inner_html in heading_re.findall(html_content):
            clean_heading = tag_re.sub('', inner_html).strip()
            if clean_heading and not clean_heading.startswith('fn:'):
                outlines_data[file_path].append({
                    'level': int(level),
                    'text': clean_heading,
                    'html': full_tag.strip(),
                })

        # Extract quotes (blockquotes)
        for quote in quote_re.findall(html_content):
            clean_quote = tag_re.sub('', quote).strip()
            if clean_quote:
                quotes_data[file_path].append({
                    'text': clean_quote,
                    'html': quote.strip(),
                })

        # Extract connections (cross-references). The pattern already requires
        # a leading "/", so only protocol-relative "//" links need excluding.
        for link_text, link_url in connection_re.findall(raw_content):
            if link_url.startswith('//'):
                continue
            connections_outgoing[file_path].append({
                'text': link_text,
                'url': link_url,
                'target_file': link_url,
            })
            # Track incoming references
            connections_incoming[link_url].append({
                'text': link_text,
                'source_file': file_path,
                'context': link_text,
            })

        # Extract terms for index: capitalized words of 4+ letters, recorded
        # once per file.
        # NOTE(review): because of the set(), each file contributes a term at
        # most once, so the per-article 'count' computed below is always 1.
        # Confirm whether real occurrence counts were intended.
        for word in set(term_re.findall(raw_content)) - ignored_terms:
            terms_data[word].append({
                'file': file_path,
                'context': word,
            })

    # Sort blog posts by date (newest first)
    blog_posts.sort(key=lambda x: x['pub_date'], reverse=True)

    # Create URL and metadata mappings for terms processing
    url_metadata = {}
    file_to_url = {}
    for post in blog_posts:
        url_metadata[post['url']] = post
        post_file = post.get('file_path') or post.get('path')
        if post_file:
            file_to_url[post_file] = post['url']

    # Keep only terms that appear in at least two files, and of those only
    # the ones that map to at least one blog post with a title.
    final_terms = {}
    total_term_occurrences = 0
    for term, refs in sorted(terms_data.items()):
        if len(refs) < 2:
            continue

        # Group by file to get counts per article
        file_counts = Counter(ref['file'] for ref in refs)

        articles = []
        for term_file, count in file_counts.items():
            url = file_to_url.get(term_file, '')
            if not url:
                continue
            title = url_metadata.get(url, {}).get('title', '')
            if title:  # Only include articles with valid titles
                articles.append({'url': url, 'title': title, 'count': count})

        if articles:  # Only include terms that have valid articles
            term_total = sum(file_counts.values())
            final_terms[term] = {
                'articles': articles,
                'total_count': term_total,
                'article_count': len(articles),
            }
            total_term_occurrences += term_total

    # Build final cache structures
    return {
        'blog_posts': blog_posts,
        'sidenotes': {
            'articles': dict(sidenotes_data),
            'total_count': sum(len(notes) for notes in sidenotes_data.values()),
        },
        'outlines': {
            'articles': dict(outlines_data),
            'total_count': sum(len(headings) for headings in outlines_data.values()),
        },
        'quotes': {
            'articles': dict(quotes_data),
            'total_count': sum(len(quotes) for quotes in quotes_data.values()),
        },
        'connections': {
            'outgoing_refs': dict(connections_outgoing),
            'incoming_refs': dict(connections_incoming),
            'total_count': sum(len(refs) for refs in connections_outgoing.values()),
        },
        'terms': {
            'terms': final_terms,
            'total_occurrences': total_term_occurrences,
        },
    }
|
|
|
|
@app.context_processor
def inject_index_counts():
    """Expose ``index_counts`` to every template.

    The totals are read from ``metadata_cache``. If any lookup raises, every
    counter is reported as 0 so that template rendering never fails here.
    """
    counter_names = (
        'sidenotes',
        'outlines',
        'quotes',
        'connections_outgoing',
        'connections_incoming',
        'terms',
        'terms_total_refs',
    )
    try:
        # Use optimized MetadataCache instead of old cached functions
        notes = metadata_cache.get_sidenotes()
        outlines = metadata_cache.get_outlines()
        quotes = metadata_cache.get_quotes()
        links = metadata_cache.get_connections()
        terms = metadata_cache.get_terms()

        totals = (
            notes.get('total_count', 0),
            outlines.get('total_count', 0),
            quotes.get('total_count', 0),
            links.get('total_outgoing', 0),
            links.get('total_incoming', 0),
            terms.get('total_terms', 0),
            terms.get('total_occurrences', 0),
        )
    except Exception:
        # Fallback to prevent template errors
        totals = (0,) * len(counter_names)

    return {'index_counts': dict(zip(counter_names, totals))}
|
|
|
|
DATA_DIR = Path('data')
|
|
|
|
# Import the clean SVG icon generator
|
|
from svg_icon_generator import generate_unique_svg_icon
|
|
|
|
def _svg_gradient(tag, gradient_id, stops, **geometry):
    """Return a ``<linearGradient>``/``<radialGradient>`` element as a string.

    The same ``tag`` is used for the opening and the closing element, so the
    markup cannot end up with mismatched gradient tags. ``stops`` is an
    iterable of ``(offset, color)`` pairs; ``geometry`` supplies the remaining
    attributes (``x1``/``y1``/``x2``/``y2`` or ``cx``/``cy``/``r``).
    """
    attrs = "".join(f' {name}="{value}"' for name, value in geometry.items())
    stop_xml = "\n      ".join(
        f'<stop offset="{offset}" stop-color="{color}"/>' for offset, color in stops
    )
    return f'<{tag} id="{gradient_id}"{attrs}>\n      {stop_xml}\n    </{tag}>'


def _stable_hash(text):
    """Return a non-negative integer hash of ``text`` that is stable across runs.

    The builtin ``hash()`` is salted per process for strings, which would make
    the generated icon differ between restarts and worker processes.
    """
    return int.from_bytes(hashlib.md5(text.encode()).digest()[:8], "big")


def generate_unique_svg_icon_OLD(title, size=24):
    """Generate a sophisticated unique SVG icon based on the title string.

    The MD5 digest of ``title`` selects two HSL colors and one of twenty
    drawing patterns, so the same title always produces the same icon.

    Args:
        title: Text the icon is derived from.
        size: Width and height of the square SVG, in user units.

    Returns:
        The SVG document encoded as a ``data:image/svg+xml;base64,...`` URL.
    """
    hash_bytes = hashlib.md5(title.encode()).digest()

    # Extract values from hash for various parameters
    hue1 = (hash_bytes[0] * 360) // 256
    hue2 = (hash_bytes[1] * 360) // 256
    saturation = 50 + (hash_bytes[2] * 30) // 256  # 50-80% saturation
    lightness = 40 + (hash_bytes[3] * 35) // 256  # 40-75% lightness

    # Choose one of the 20 patterns
    pattern_type = hash_bytes[4] % 20

    color1 = f"hsl({hue1}, {saturation}%, {lightness}%)"
    color2 = f"hsl({hue2}, {saturation + 10}%, {lightness + 15}%)"

    gradient_id = f"grad_{_stable_hash(title) % 10000}"
    grad_ref = f"url(#{gradient_id})"

    center_x = center_y = size // 2
    diagonal = {"x1": "0%", "y1": "0%", "x2": "100%", "y2": "100%"}
    centered = {"cx": "50%", "cy": "50%", "r": "50%"}

    shapes = []
    defs = []

    if pattern_type == 0:  # Layered circles with gradients
        defs.append(_svg_gradient("linearGradient", gradient_id,
                                  [("0%", color1), ("100%", color2)], **diagonal))

        # Multiple concentric circles
        for i in range(3):
            radius = size // 3 - i * (size // 12)
            opacity = 0.7 + i * 0.1
            shapes.append(f'<circle cx="{center_x}" cy="{center_y}" r="{radius}" fill="{grad_ref}" opacity="{opacity}"/>')

    elif pattern_type == 1:  # Flower of Life
        defs.append(_svg_gradient("radialGradient", gradient_id,
                                  [("0%", color1), ("100%", color2)], **centered))

        # Six surrounding circles around a center circle
        radius = size // 5
        shapes.append(f'<circle cx="{center_x}" cy="{center_y}" r="{radius}" fill="none" stroke="{grad_ref}" stroke-width="2" opacity="0.8"/>')

        for i in range(6):
            angle = (i * 60) * math.pi / 180
            x = center_x + radius * math.cos(angle)
            y = center_y + radius * math.sin(angle)
            shapes.append(f'<circle cx="{x:.1f}" cy="{y:.1f}" r="{radius}" fill="none" stroke="{grad_ref}" stroke-width="2" opacity="0.7"/>')

        # Outer petals for extended flower
        for i in range(12):
            angle = (i * 30) * math.pi / 180
            x = center_x + radius * 1.732 * math.cos(angle)  # sqrt(3) spacing
            y = center_y + radius * 1.732 * math.sin(angle)
            shapes.append(f'<circle cx="{x:.1f}" cy="{y:.1f}" r="{radius//2}" fill="none" stroke="{color2}" stroke-width="1" opacity="0.5"/>')

    elif pattern_type == 2:  # Crystalline line art
        defs.append(_svg_gradient("linearGradient", gradient_id,
                                  [("0%", color1), ("50%", color2), ("100%", color2)], **diagonal))

        points = []
        for i in range(6):
            angle = (i * 60) * math.pi / 180
            x = center_x + (size // 3) * math.cos(angle)
            y = center_y + (size // 3) * math.sin(angle)
            points.append(f"{x:.1f},{y:.1f}")

        # Main hexagonal outline
        outline = " ".join(points)
        shapes.append(f'<polygon points="{outline}" fill="none" stroke="{grad_ref}" stroke-width="2.5" opacity="0.8"/>')

        # Inner spokes
        for i in range(6):
            angle = (i * 60) * math.pi / 180
            x = center_x + (size // 6) * math.cos(angle)
            y = center_y + (size // 6) * math.sin(angle)
            shapes.append(f'<line x1="{center_x}" y1="{center_y}" x2="{x:.1f}" y2="{y:.1f}" stroke="{color2}" stroke-width="1.5" opacity="0.6"/>')

        # Central point
        shapes.append(f'<circle cx="{center_x}" cy="{center_y}" r="2" fill="none" stroke="{color1}" stroke-width="1.5" opacity="0.9"/>')

    elif pattern_type == 3:  # Flowing wave interference - line art
        defs.append(_svg_gradient(
            "linearGradient", gradient_id,
            [("0%", color1), ("30%", color2), ("70%", color2), ("100%", color1)],
            x1="0%", y1="50%", x2="100%", y2="50%"))

        amplitude = size // 8
        for wave in range(3):
            frequency = 0.15 + wave * 0.08
            phase_shift = wave * 1.5
            segments = [f"M 0,{size//2}"]
            for x in range(0, size, 1):
                y = size // 2 + amplitude * math.sin(x * frequency + phase_shift)
                segments.append(f"L {x},{y:.1f}")
            path_data = " ".join(segments)

            stroke_width = 2.5 - wave * 0.5
            opacity = 0.85 - wave * 0.15
            shapes.append(f'<path d="{path_data}" stroke="{grad_ref}" stroke-width="{stroke_width:.1f}" fill="none" opacity="{opacity:.2f}" stroke-linecap="round"/>')

    elif pattern_type == 4:  # Golden ratio spiral - line art
        defs.append(_svg_gradient(
            "linearGradient", gradient_id,
            [("0%", color2), ("40%", color2), ("100%", color1)],
            x1="20%", y1="20%", x2="80%", y2="80%"))

        golden_ratio = 1.618033988749

        # Logarithmic spiral, cut off just before it leaves the canvas
        segments = [f"M {center_x},{center_y}"]
        for t in range(0, 400, 2):
            angle = t * math.pi / 180
            radius = (size // 10) * math.pow(golden_ratio, angle / (math.pi / 1.8))
            if radius > size // 2 - 2:
                break
            x = center_x + radius * math.cos(angle)
            y = center_y + radius * math.sin(angle)
            segments.append(f"L {x:.1f},{y:.1f}")
        path_data = " ".join(segments)

        shapes.append(f'<path d="{path_data}" stroke="{grad_ref}" stroke-width="3" fill="none" opacity="0.85" stroke-linecap="round"/>')

        # Subtle Fibonacci square outlines
        for i, fib in enumerate([2, 3, 5, 8]):
            if fib * 2 > size // 4:
                break
            square_size = fib * 2
            x = center_x - square_size // 2 + i * 1.5
            y = center_y - square_size // 2 + i * 1.5
            opacity = 0.4 - i * 0.08
            shapes.append(f'<rect x="{x}" y="{y}" width="{square_size}" height="{square_size}" fill="none" stroke="{color2}" stroke-width="1" opacity="{opacity:.2f}"/>')

        # Center point
        shapes.append(f'<circle cx="{center_x}" cy="{center_y}" r="1.5" fill="none" stroke="{color1}" stroke-width="2" opacity="0.9"/>')

    elif pattern_type == 5:  # Tessellation pattern
        defs.append(f'''<pattern id="{gradient_id}" x="0" y="0" width="8" height="8" patternUnits="userSpaceOnUse">
      <rect width="8" height="8" fill="{color1}"/>
      <circle cx="4" cy="4" r="2" fill="{color2}" opacity="0.7"/>
    </pattern>''')

        # Tessellated hexagon
        points = []
        for i in range(6):
            angle = (i * 60) * math.pi / 180
            x = size // 2 + (size // 2.5) * math.cos(angle)
            y = size // 2 + (size // 2.5) * math.sin(angle)
            points.append(f"{x:.1f},{y:.1f}")

        outline = " ".join(points)
        shapes.append(f'<polygon points="{outline}" fill="{grad_ref}" stroke="{color2}" stroke-width="1"/>')

    elif pattern_type == 6:  # Fractal tree
        defs.append(_svg_gradient("linearGradient", gradient_id,
                                  [("0%", color1), ("100%", color2)],
                                  x1="0%", y1="100%", x2="0%", y2="0%"))

        def draw_branch(x, y, angle, length, depth):
            """Return <line> elements for a branch and its two sub-branches."""
            if depth == 0 or length < 2:
                return []
            end_x = x + length * math.cos(angle)
            end_y = y + length * math.sin(angle)
            branches = [f'<line x1="{x:.1f}" y1="{y:.1f}" x2="{end_x:.1f}" y2="{end_y:.1f}" stroke="{grad_ref}" stroke-width="{depth}" opacity="{0.8 if depth > 1 else 0.6}"/>']
            branches.extend(draw_branch(end_x, end_y, angle - 0.5, length * 0.7, depth - 1))
            branches.extend(draw_branch(end_x, end_y, angle + 0.5, length * 0.7, depth - 1))
            return branches

        shapes.extend(draw_branch(size // 2, size * 0.9, -math.pi / 2, size // 3, 4))

    elif pattern_type == 7:  # Dot matrix
        dot_size = size // 12
        spacing = max(1, size // 6)  # a zero step would make range() raise
        for x in range(spacing, size - spacing + 1, spacing):
            for y in range(spacing, size - spacing + 1, spacing):
                opacity = 0.4 + (_stable_hash(f"{x},{y}") % 6) * 0.1
                color = color1 if (x + y) % 2 == 0 else color2
                shapes.append(f'<circle cx="{x}" cy="{y}" r="{dot_size}" fill="{color}" opacity="{opacity}"/>')

    elif pattern_type == 8:  # Triangular mosaic
        defs.append(_svg_gradient("linearGradient", gradient_id,
                                  [("0%", color1), ("50%", color2), ("100%", color1)], **diagonal))

        tri_size = size // 3
        for i in range(3):
            for j in range(3):
                x = j * tri_size
                y = i * tri_size
                if (i + j) % 2 == 0:
                    shapes.append(f'<polygon points="{x},{y} {x+tri_size},{y} {x+tri_size//2},{y+tri_size}" fill="{grad_ref}" opacity="0.8"/>')
                else:
                    shapes.append(f'<polygon points="{x},{y+tri_size} {x+tri_size},{y+tri_size} {x+tri_size//2},{y}" fill="{color2}" opacity="0.6"/>')

    elif pattern_type == 9:  # Organic bubbles
        defs.append(_svg_gradient("radialGradient", gradient_id,
                                  [("0%", color2), ("100%", color1)],
                                  cx="30%", cy="30%", r="70%"))

        bubble_positions = [
            (size * 0.3, size * 0.25, size // 6),
            (size * 0.7, size * 0.4, size // 8),
            (size * 0.5, size * 0.7, size // 5),
            (size * 0.2, size * 0.6, size // 10),
            (size * 0.8, size * 0.8, size // 7),
            (size * 0.6, size * 0.2, size // 9),
        ]

        for i, (x, y, radius) in enumerate(bubble_positions):
            opacity = 0.7 - (i % 3) * 0.15
            bubble_color = grad_ref if i % 2 == 0 else color2
            shapes.append(f'<circle cx="{x:.1f}" cy="{y:.1f}" r="{radius}" fill="{bubble_color}" opacity="{opacity}"/>')

    elif pattern_type == 10:  # Metatron's Cube
        # Linear gradient (x1/y1/x2/y2 attributes); the closing tag must match.
        defs.append(_svg_gradient("linearGradient", gradient_id,
                                  [("0%", color1), ("100%", color2)], **diagonal))

        radius = size // 8

        # Center circle
        shapes.append(f'<circle cx="{center_x}" cy="{center_y}" r="{radius//2}" fill="{grad_ref}" opacity="0.9"/>')

        # Inner 6 circles (hexagonal pattern)
        for i in range(6):
            angle = (i * 60) * math.pi / 180
            x = center_x + radius * math.cos(angle)
            y = center_y + radius * math.sin(angle)
            shapes.append(f'<circle cx="{x:.1f}" cy="{y:.1f}" r="{radius//3}" fill="none" stroke="{grad_ref}" stroke-width="2" opacity="0.8"/>')

        # Outer 6 circles
        for i in range(6):
            angle = (i * 60) * math.pi / 180
            x = center_x + radius * 2 * math.cos(angle)
            y = center_y + radius * 2 * math.sin(angle)
            shapes.append(f'<circle cx="{x:.1f}" cy="{y:.1f}" r="{radius//4}" fill="none" stroke="{color2}" stroke-width="1" opacity="0.6"/>')

        # Connect neighbouring inner circles with lines
        for i in range(6):
            angle1 = (i * 60) * math.pi / 180
            angle2 = ((i + 1) * 60) * math.pi / 180
            x1 = center_x + radius * math.cos(angle1)
            y1 = center_y + radius * math.sin(angle1)
            x2 = center_x + radius * math.cos(angle2)
            y2 = center_y + radius * math.sin(angle2)
            shapes.append(f'<line x1="{x1:.1f}" y1="{y1:.1f}" x2="{x2:.1f}" y2="{y2:.1f}" stroke="{color2}" stroke-width="1" opacity="0.4"/>')

    elif pattern_type == 11:  # Flower petals
        defs.append(_svg_gradient("radialGradient", gradient_id,
                                  [("0%", color2), ("100%", color1)], **centered))

        num_petals = 6 + (hash_bytes[6] % 6)  # 6-11 petals
        for i in range(num_petals):
            angle = (i * 360 / num_petals) * math.pi / 180
            x = size // 2 + (size // 3) * math.cos(angle)
            y = size // 2 + (size // 3) * math.sin(angle)
            shapes.append(f'<ellipse cx="{x:.1f}" cy="{y:.1f}" rx="{size//8}" ry="{size//6}" fill="{grad_ref}" opacity="0.8" transform="rotate({i * 360 / num_petals} {x:.1f} {y:.1f})"/>')

        # Center
        shapes.append(f'<circle cx="{size//2}" cy="{size//2}" r="{size//10}" fill="{color1}"/>')

    elif pattern_type == 12:  # Diamond lattice
        diamond_size = max(1, size // 6)  # a zero step would make range() raise
        half = diamond_size // 2
        for x in range(diamond_size, size, diamond_size * 2):
            for y in range(diamond_size, size, diamond_size * 2):
                outline = f"{x},{y - half} {x + half},{y} {x},{y + half} {x - half},{y}"
                color = color1 if (x + y) % 4 == 0 else color2
                shapes.append(f'<polygon points="{outline}" fill="{color}" opacity="0.7"/>')

    elif pattern_type == 13:  # Sine wave pattern
        defs.append(_svg_gradient("linearGradient", gradient_id,
                                  [("0%", color1), ("100%", color2)],
                                  x1="0%", y1="0%", x2="0%", y2="100%"))

        amplitude = size // 8
        for wave in range(5):
            frequency = 0.3 + wave * 0.1
            phase = wave * math.pi / 3
            segments = [f"M 0,{size//2}"]
            for x in range(0, size, 2):
                y = size // 2 + amplitude * math.sin(x * frequency + phase)
                segments.append(f"L {x},{y:.1f}")
            path_data = " ".join(segments)
            shapes.append(f'<path d="{path_data}" stroke="{grad_ref}" stroke-width="2" fill="none" opacity="{0.9 - wave * 0.15}"/>')

    elif pattern_type == 14:  # Hexagonal grid
        hex_size = size // 8
        for row in range(4):
            for col in range(4):
                x = col * hex_size * 1.5 + (row % 2) * hex_size * 0.75
                y = row * hex_size * 0.866
                if x < size and y < size:
                    points = []
                    for i in range(6):
                        angle = (i * 60) * math.pi / 180
                        px = x + hex_size * math.cos(angle)
                        py = y + hex_size * math.sin(angle)
                        points.append(f"{px:.1f},{py:.1f}")
                    outline = " ".join(points)
                    color = color1 if (row + col) % 2 == 0 else color2
                    shapes.append(f'<polygon points="{outline}" fill="{color}" opacity="0.6" stroke="{color2}" stroke-width="1"/>')

    elif pattern_type == 15:  # Sri Yantra
        defs.append(_svg_gradient("radialGradient", gradient_id,
                                  [("0%", color1), ("100%", color2)], **centered))

        outer_radius = size // 2.5

        # 4 upward pointing triangles
        for i in range(4):
            triangle_size = outer_radius * (1 - i * 0.2)

            x1 = center_x
            y1 = center_y - triangle_size
            x2 = center_x - triangle_size * 0.866  # sin(60 degrees)
            y2 = center_y + triangle_size * 0.5
            x3 = center_x + triangle_size * 0.866
            y3 = center_y + triangle_size * 0.5

            opacity = 0.7 - i * 0.1
            shapes.append(f'<polygon points="{x1:.1f},{y1:.1f} {x2:.1f},{y2:.1f} {x3:.1f},{y3:.1f}" fill="none" stroke="{grad_ref}" stroke-width="2" opacity="{opacity}"/>')

        # 5 downward pointing triangles
        for i in range(5):
            triangle_size = outer_radius * (0.9 - i * 0.15)

            x1 = center_x
            y1 = center_y + triangle_size
            x2 = center_x - triangle_size * 0.866
            y2 = center_y - triangle_size * 0.5
            x3 = center_x + triangle_size * 0.866
            y3 = center_y - triangle_size * 0.5

            opacity = 0.6 - i * 0.08
            shapes.append(f'<polygon points="{x1:.1f},{y1:.1f} {x2:.1f},{y2:.1f} {x3:.1f},{y3:.1f}" fill="none" stroke="{color2}" stroke-width="1" opacity="{opacity}"/>')

        # Central point
        shapes.append(f'<circle cx="{center_x}" cy="{center_y}" r="{size//20}" fill="{color1}" opacity="0.9"/>')

        # Outer circles
        shapes.append(f'<circle cx="{center_x}" cy="{center_y}" r="{outer_radius * 1.1}" fill="none" stroke="{color2}" stroke-width="1" opacity="0.5"/>')
        shapes.append(f'<circle cx="{center_x}" cy="{center_y}" r="{outer_radius * 1.25}" fill="none" stroke="{color1}" stroke-width="1" opacity="0.3"/>')

    elif pattern_type == 16:  # Mosaic tiles
        tile_size = max(1, size // 5)  # a zero step would make range() raise
        for x in range(0, size, tile_size):
            for y in range(0, size, tile_size):
                # Tile shape chosen from the position and title, stable across runs
                tile_hash = _stable_hash(f"{x}-{y}-{title}") % 4
                if tile_hash == 0:
                    shapes.append(f'<rect x="{x}" y="{y}" width="{tile_size}" height="{tile_size}" fill="{color1}" opacity="0.8"/>')
                elif tile_hash == 1:
                    shapes.append(f'<circle cx="{x + tile_size//2}" cy="{y + tile_size//2}" r="{tile_size//3}" fill="{color2}" opacity="0.7"/>')
                elif tile_hash == 2:
                    outline = f"{x},{y+tile_size} {x+tile_size//2},{y} {x+tile_size},{y+tile_size}"
                    shapes.append(f'<polygon points="{outline}" fill="{color1}" opacity="0.6"/>')
                else:
                    shapes.append(f'<rect x="{x}" y="{y}" width="{tile_size}" height="{tile_size}" fill="{color2}" opacity="0.5" rx="{tile_size//4}"/>')

    elif pattern_type == 17:  # Orbital rings
        defs.append(_svg_gradient("linearGradient", gradient_id,
                                  [("0%", color1), ("100%", color2)], **diagonal))

        for i in range(4):
            radius = size // 6 + i * size // 12
            rotation = i * 45
            shapes.append(f'<circle cx="{size//2}" cy="{size//2}" r="{radius}" fill="none" stroke="{grad_ref}" stroke-width="2" opacity="{0.8 - i*0.15}" transform="rotate({rotation} {size//2} {size//2})"/>')
            # Small planet on the ring
            planet_x = size // 2 + radius
            planet_y = size // 2
            shapes.append(f'<circle cx="{planet_x}" cy="{planet_y}" r="3" fill="{color2}" transform="rotate({rotation} {size//2} {size//2})"/>')

    elif pattern_type == 18:  # Woven pattern
        defs.append(f'''<pattern id="{gradient_id}" x="0" y="0" width="6" height="6" patternUnits="userSpaceOnUse">
      <rect width="6" height="6" fill="{color1}"/>
      <rect x="0" y="0" width="3" height="3" fill="{color2}"/>
      <rect x="3" y="3" width="3" height="3" fill="{color2}"/>
    </pattern>''')

        # Woven effect from overlapping vertical and horizontal strips
        for i in range(6):
            x = i * size // 6
            shapes.append(f'<rect x="{x}" y="0" width="{size//12}" height="{size}" fill="{grad_ref}" opacity="0.7"/>')
            shapes.append(f'<rect x="0" y="{x}" width="{size}" height="{size//12}" fill="{color2}" opacity="0.5"/>')

    else:  # pattern_type == 19: Platonic Tetrahedron
        # Linear gradient (x1/y1/x2/y2 attributes); the closing tag must match.
        defs.append(_svg_gradient("linearGradient", gradient_id,
                                  [("0%", color1), ("50%", color2), ("100%", color1)], **diagonal))

        tet_size = size // 2.8

        # Main large triangle (upward)
        x1 = center_x
        y1 = center_y - tet_size * 0.7
        x2 = center_x - tet_size * 0.866
        y2 = center_y + tet_size * 0.5
        x3 = center_x + tet_size * 0.866
        y3 = center_y + tet_size * 0.5

        shapes.append(f'<polygon points="{x1:.1f},{y1:.1f} {x2:.1f},{y2:.1f} {x3:.1f},{y3:.1f}" fill="none" stroke="{grad_ref}" stroke-width="3" opacity="0.9"/>')

        # Inverted triangle (downward)
        y1_inv = center_y + tet_size * 0.4
        y2_inv = center_y - tet_size * 0.3
        y3_inv = center_y - tet_size * 0.3
        x2_inv = center_x - tet_size * 0.5
        x3_inv = center_x + tet_size * 0.5

        shapes.append(f'<polygon points="{center_x:.1f},{y1_inv:.1f} {x2_inv:.1f},{y2_inv:.1f} {x3_inv:.1f},{y3_inv:.1f}" fill="none" stroke="{color2}" stroke-width="2" opacity="0.8"/>')

        # Inner triangles
        for i in range(3):
            inner_size = tet_size * (0.6 - i * 0.15)
            x1_i = center_x
            y1_i = center_y - inner_size * 0.4
            x2_i = center_x - inner_size * 0.5
            y2_i = center_y + inner_size * 0.2
            x3_i = center_x + inner_size * 0.5
            y3_i = center_y + inner_size * 0.2

            opacity = 0.7 - i * 0.15
            shapes.append(f'<polygon points="{x1_i:.1f},{y1_i:.1f} {x2_i:.1f},{y2_i:.1f} {x3_i:.1f},{y3_i:.1f}" fill="none" stroke="{color1}" stroke-width="1" opacity="{opacity}"/>')

        # Central point
        shapes.append(f'<circle cx="{center_x}" cy="{center_y}" r="{size//25}" fill="{color1}" opacity="1.0"/>')

        # Dots on the three corners of the main triangle
        vertex_radius = size // 30
        shapes.append(f'<circle cx="{x1:.1f}" cy="{y1:.1f}" r="{vertex_radius}" fill="{color2}" opacity="0.8"/>')
        shapes.append(f'<circle cx="{x2:.1f}" cy="{y2:.1f}" r="{vertex_radius}" fill="{color2}" opacity="0.8"/>')
        shapes.append(f'<circle cx="{x3:.1f}" cy="{y3:.1f}" r="{vertex_radius}" fill="{color2}" opacity="0.8"/>')

    # Compose SVG
    defs_content = "\n    ".join(defs)
    shapes_content = "\n  ".join(shapes)

    svg = f'''<svg width="{size}" height="{size}" viewBox="0 0 {size} {size}" xmlns="http://www.w3.org/2000/svg">
  <defs>
    {defs_content}
  </defs>
  {shapes_content}
</svg>'''

    # Convert to data URL
    svg_b64 = base64.b64encode(svg.encode()).decode()
    return f"data:image/svg+xml;base64,{svg_b64}"
|
|
|
|
@lru_cache(maxsize=1000)
def get_cached_markdown_title(file_path):
    """Extract and cache the H1 title from a markdown file.

    Only the first 1000 characters of the file are inspected. Results are
    memoised per ``file_path`` by ``lru_cache`` and are not invalidated when
    the file changes, so an edited title is only picked up after the entry is
    evicted or the process restarts.

    Args:
        file_path: Hashable path (``str`` or ``Path``) of the file to read.

    Returns:
        The stripped title text, or ``None`` when no title is found or the
        file cannot be read or decoded.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read(1000)  # Only read first 1000 chars to find title
    except (OSError, ValueError, TypeError):
        # Best effort: a missing/unreadable file, invalid UTF-8
        # (UnicodeDecodeError is a ValueError) or an unusable path simply has
        # no title. A bare ``except:`` would also swallow BaseException
        # subclasses such as KeyboardInterrupt.
        return None

    # Look for first H1 markdown title
    title_match = re.search(r'^#\s+(.+?)$', content, re.MULTILINE)
    if title_match:
        return title_match.group(1).strip()

    # Fallback: look for HTML H1 if markdown was already rendered
    title_match = re.search(r'<h1[^>]*>(.*?)</h1>', content, re.IGNORECASE)
    if title_match:
        # Remove HTML tags from title, then decode entities
        title = re.sub(r'<[^>]+>', '', title_match.group(1))
        return html.unescape(title).strip()

    return None
|
|
|
|
@lru_cache(maxsize=500)
def generate_folder_icon(title, size=24):
    """Generate a folder icon with a unique accent color based on title.

    The accent color and the gradient id are both derived from the MD5 digest
    of ``title``, so the same title always yields the same icon. The built-in
    ``hash()`` is deliberately not used for the gradient id: string hashes are
    salted per process, which made the output differ between workers and
    restarts.

    Args:
        title: Text that seeds the icon's accent color.
        size: Rendered width and height in pixels (the viewBox stays 24x24).

    Returns:
        A ``data:image/svg+xml;base64,...`` URL containing the SVG.
    """
    hash_bytes = hashlib.md5(title.encode()).digest()

    # Generate accent color
    hue = (hash_bytes[0] * 360) // 256
    saturation = 60 + (hash_bytes[1] * 20) // 256  # 60-79%
    lightness = 45 + (hash_bytes[2] * 20) // 256  # 45-64%

    accent_color = f"hsl({hue}, {saturation}%, {lightness}%)"
    folder_base = "#e8e8e8"

    # Stable per-title id taken from digest bytes not used for the color
    gradient_id = f"folder_grad_{int.from_bytes(hash_bytes[3:5], 'big') % 1000}"

    svg = f'''<svg width="{size}" height="{size}" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg">
  <defs>
    <linearGradient id="{gradient_id}" x1="0%" y1="0%" x2="100%" y2="100%">
      <stop offset="0%" stop-color="{folder_base}"/>
      <stop offset="100%" stop-color="{accent_color}"/>
    </linearGradient>
  </defs>
  <path d="M10 4H4c-1.11 0-2 .89-2 2v12c0 1.11.89 2 2 2h16c1.11 0 2-.89 2-2V8c0-1.11-.89-2-2-2h-8l-2-2z"
        fill="url(#{gradient_id})"
        stroke="{accent_color}"
        stroke-width="0.5"/>
  <circle cx="18" cy="7" r="2" fill="{accent_color}" opacity="0.8"/>
</svg>'''

    svg_b64 = base64.b64encode(svg.encode()).decode()
    return f"data:image/svg+xml;base64,{svg_b64}"
|
|
|
|
def get_directory_structure(path):
    """List the visible entries of a directory for an index page.

    Dotfiles, ``index.md`` (any case) and ``*.bak`` entries are skipped.
    Entries whose ``stat()`` fails (for example a dangling symlink) are
    skipped as well instead of aborting the whole listing.

    Args:
        path: ``Path`` of the directory to list; expected to live under
            ``DATA_DIR`` because URLs are built relative to it.

    Returns:
        A list of dicts, directories first and then files, each group in
        reverse name order. Each dict has the keys ``name``,
        ``display_name``, ``path``, ``url_path``, ``is_dir``,
        ``is_markdown``, ``is_image``, ``size``, ``created``, ``modified``,
        ``file_date``, ``file_type``, ``static_path`` and ``unique_icon``.
        An empty list is returned when ``path`` is not an existing directory.
    """
    if not path.exists() or not path.is_dir():
        return []

    # Separate directories and files for better organization
    dirs = []
    files = []

    for item in sorted(path.iterdir(), reverse=True):
        name = item.name
        if name.startswith('.') or name.lower() == 'index.md' or name.endswith('.bak'):
            continue

        try:
            stat_result = item.stat()  # One syscall instead of one per field
        except OSError:
            continue

        is_dir = item.is_dir()
        is_file = item.is_file()
        is_markdown = item.suffix == '.md'
        relative_path = str(item.relative_to(DATA_DIR))

        # Create display name without extension for files
        display_name = item.stem if is_file and item.suffix else name
        display_name = display_name.replace('-', ' ').replace('_', ' ').title()

        # Create clean URL path without .md extension
        if is_dir:
            url_path = '/' + relative_path + '/'
        elif is_markdown:
            url_path = '/' + relative_path[:-3]  # Remove .md extension
        else:
            url_path = '/' + relative_path

        # Extract date from markdown files
        file_date = None
        if is_file and is_markdown:
            try:
                with open(item, 'r', encoding='utf-8') as f:
                    for i, line in enumerate(f):
                        if i > 10:  # Only the first 11 lines are checked
                            break
                        # Look for date patterns like *January 2009* or *2014*
                        date_match = re.match(r'^\*([A-Za-z]+ \d{4}|\d{4})\*\s*$', line.strip())
                        if date_match:
                            file_date = date_match.group(1)
                            break
            except (OSError, UnicodeDecodeError):
                # Best effort: an unreadable file just has no date
                pass

        # Generate unique SVG icon based on actual content title for consistency
        icon_title = display_name  # Default to filename-based display name

        # For markdown files, prefer the H1 title from the content; the helper
        # returns None on any read problem, so no extra try/except is needed.
        if is_file and is_markdown:
            cached_title = get_cached_markdown_title(item)
            if cached_title:
                icon_title = cached_title
                display_name = cached_title

        if is_dir:
            unique_icon = generate_folder_icon(icon_title, size=32)
        else:
            unique_icon = generate_unique_svg_icon(icon_title, size=32)

        item_info = {
            'name': name,
            'display_name': display_name,
            'path': relative_path,
            'url_path': url_path,
            'is_dir': is_dir,
            'is_markdown': is_markdown,
            'is_image': item.suffix.lower() in ['.jpg', '.jpeg', '.png', '.gif', '.webp'],
            'size': stat_result.st_size if is_file else None,
            # NOTE(review): st_ctime is the metadata-change time on Unix, not
            # a true creation time.
            'created': datetime.fromtimestamp(stat_result.st_ctime),
            'modified': datetime.fromtimestamp(stat_result.st_mtime),
            'file_date': file_date,  # Date extracted from file content
            'file_type': item.suffix.lower() if is_file else 'directory',
            'static_path': f"/static/data/{relative_path}" if not is_dir else None,
            'unique_icon': unique_icon,  # Generated SVG icon
        }

        if is_dir:
            dirs.append(item_info)
        else:
            files.append(item_info)

    # Return directories first, then files
    return dirs + files
|
|
|
|
|
|
|
|
def calculate_reading_time(text):
    """Estimate reading time from the word count of (possibly HTML) text.

    Args:
        text: Plain text or HTML markup.

    Returns:
        A ``(reading_time, word_count)`` tuple; ``reading_time`` is in whole
        minutes and never less than 1.
    """
    # Replace tags with a space rather than nothing, so words separated only
    # by markup ("a<br>b", "</p><p>") are not fused into a single word.
    clean_text = re.sub(r'<[^>]+>', ' ', text)
    word_count = len(clean_text.split())
    # Average reading speed is 200-250 words per minute, using 225 as middle ground
    reading_time = max(1, round(word_count / 225))  # Minimum 1 minute
    return reading_time, word_count
|
|
|
|
|
|
|
|
def find_series_posts(metadata, current_path):
    """Find the other posts that belong to the same series as the current post.

    Walks every markdown file under ``DATA_DIR`` (except ``index.md`` files
    and the current post itself), parses its YAML front matter and collects
    those whose ``series`` value equals ``metadata['series']``.

    Args:
        metadata: Front-matter mapping of the current post.
        current_path: Path of the current post; compared as a string against
            each walked path.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``order`` and
        ``description``, sorted by ``order``. Posts without a usable numeric
        ``series_order`` sort as 999. Empty when the post has no series or
        PyYAML is unavailable.
    """
    series_name = metadata.get('series')
    if not series_name:
        return []

    try:
        import yaml
    except ImportError:
        # Without a YAML parser no front matter can be read
        return []

    front_matter = re.compile(r'^---\s*\n(.*?)\n---\s*\n', re.DOTALL)
    current = str(current_path)
    series_posts = []

    # Search through all markdown files to find posts in the same series
    for root, _dirs, files in os.walk(DATA_DIR):
        for file in files:
            if not file.endswith('.md') or file == 'index.md':
                continue

            file_path = Path(root) / file

            # Skip the current file
            if str(file_path) == current:
                continue

            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                yaml_match = front_matter.match(content)
                if not yaml_match:
                    continue
                post_metadata = yaml.safe_load(yaml_match.group(1)) or {}
            except (OSError, UnicodeDecodeError, yaml.YAMLError):
                # Best effort: unreadable or malformed posts are not listed
                continue

            # Front matter that is not a mapping cannot name a series
            if not isinstance(post_metadata, dict):
                continue
            if post_metadata.get('series') != series_name:
                continue

            # Create URL path for this post
            relative_path = str(file_path.relative_to(DATA_DIR))
            url_path = '/' + relative_path[:-3]  # Remove .md

            # Get title from metadata or filename
            title = post_metadata.get('title') or file_path.stem.replace('-', ' ').title()

            series_posts.append({
                'title': title,
                'url': url_path,
                'order': post_metadata.get('series_order', 999),
                'description': post_metadata.get('description', ''),
            })

    def _order_key(post):
        # Tolerate None / string / mixed-type series_order values, which
        # would make a plain comparison raise TypeError.
        try:
            return float(post['order'])
        except (TypeError, ValueError):
            return 999.0

    # Sort by series_order
    series_posts.sort(key=_order_key)
    return series_posts
|
|
|
|
def extract_tags_from_content(content, metadata, file_path):
    """Return the normalised tags declared in a post's front matter.

    Only explicitly defined tags are used; ``content`` and ``file_path`` are
    accepted for interface compatibility but not inspected.

    Args:
        content: Rendered content (unused).
        metadata: Front-matter mapping; its ``tags`` value may be a
            list/tuple/set or a comma-separated string.
        file_path: Path of the post (unused).

    Returns:
        A sorted list of unique, lower-cased, whitespace-stripped tags.
        Non-string tags (e.g. YAML integers) are converted with ``str``;
        ``None`` entries and empty tags are dropped.
    """
    raw_tags = metadata.get('tags')
    if not raw_tags:
        return []

    if isinstance(raw_tags, (list, tuple, set)):
        candidates = raw_tags
    else:
        candidates = str(raw_tags).split(',')

    tags = {str(tag).lower().strip() for tag in candidates if tag is not None}
    tags.discard('')  # e.g. from a trailing comma in "a, b,"

    # Sorted so the order is stable across processes (set order is not)
    return sorted(tags)
|
|
|
|
|
|
def render_markdown_file(file_path):
    """Render a markdown file to HTML and collect its display metadata.

    Processing steps: strip YAML front matter (only when it parses to a
    mapping), lift the first ``# `` heading out as the title, lift an italic
    date line such as ``*August 2025*`` into ``metadata['date']``, render with
    mistune, add anchor ids to headings, and apply poetry line breaks when the
    path contains ``poetry``.

    Args:
        file_path: Path (``Path`` or ``str``) of the markdown file.

    Returns:
        On success a dict with the keys ``content``, ``title``, ``metadata``,
        ``reading_time``, ``word_count``, ``tags``, ``series_posts``,
        ``series_name`` and ``unique_icon``. On any failure a dict with only
        ``content`` (an HTML error paragraph), ``title`` (``'Error'``) and
        ``metadata`` (``{}``).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Extract YAML front matter if it exists
        metadata = {}
        yaml_match = re.match(r'^---\s*\n(.*?)\n---\s*\n', content, re.DOTALL)
        if yaml_match:
            try:
                import yaml
                parsed = yaml.safe_load(yaml_match.group(1)) or {}
            except Exception:
                # Best effort: a missing parser or malformed YAML leaves the
                # block in the content. ``Exception`` rather than a bare
                # ``except`` so BaseException subclasses still propagate.
                parsed = None
            # Only a mapping is real front matter; anything else (e.g. text
            # between two horizontal rules) stays in the document.
            if isinstance(parsed, dict):
                metadata = parsed
                content = content[yaml_match.end():]

        # Extract the first H1 ("# Title") line and remove it from the content
        # to avoid duplicating the page title.
        first_h1 = None
        h1_match = re.search(r'^# (.+?)$', content, re.MULTILINE)
        if h1_match:
            first_h1 = h1_match.group(1).strip()
            content = content[:h1_match.start()] + content[h1_match.end():]

        # Extract date from italic date pattern (e.g., "*August 2025*").
        # Only month/year or bare-year lines match, not quotes or long text.
        date_match = re.search(r'^\*([A-Za-z]+ \d{4}|\d{4})\*\s*$', content, re.MULTILINE)
        if date_match and not metadata.get('date'):
            date_text = date_match.group(1).strip()
            lowered = date_text.lower()
            # Skip only if it's "January 2025" (current year placeholder)
            if not (lowered.startswith('january') and '2025' in date_text):
                # Format "January YYYY" as just "YYYY" for cleaner display;
                # other months like "August 2025" keep the full format.
                january_only = re.match(r'^january\s+(\d{4})$', lowered)
                if january_only:
                    date_text = january_only.group(1)
                metadata['date'] = date_text
                # Remove the date line from content
                content = content[:date_match.start()] + content[date_match.end():]

        # Configure mistune renderer with URL plugin for bare links
        markdown = mistune.create_markdown(
            escape=False,
            plugins=['strikethrough', 'footnotes', 'table', 'task_lists', 'def_list', 'url']
        )

        # Process content to HTML
        html_content = markdown(content.strip())

        # Add anchor IDs to headings using post-processing on HTML
        def add_heading_anchor_ids(html_content):
            def replace_heading(match):
                tag = match.group(1)  # h1, h2, etc.
                classes = match.group(2) or ''  # existing attributes if any
                text = match.group(3)

                # Generate anchor ID from heading text (remove HTML tags first)
                clean_text = re.sub(r'<[^>]+>', '', text)
                anchor_id = re.sub(r'[^\w\s-]', '', clean_text.lower()).replace(' ', '-')
                anchor_id = re.sub(r'-+', '-', anchor_id).strip('-')  # Clean up multiple dashes

                # Add id attribute, preserving any existing attributes
                return f'<{tag} id="{anchor_id}"{classes}>{text}</{tag}>'

            # Match h1-h6 tags whose content is plain text (no nested tags)
            return re.sub(r'<(h[1-6])(\s+[^>]*)?>([^<]+)</h[1-6]>', replace_heading, html_content)

        html_content = add_heading_anchor_ids(html_content)

        # Post-processing for poetry line breaks: for files whose path contains
        # "poetry", convert single line breaks within paragraphs to <br> tags.
        if file_path and 'poetry' in str(file_path):
            html_content = re.sub(r'<p>(.*?)</p>',
                                  lambda m: '<p>' + m.group(1).replace('\n', '<br>\n') + '</p>',
                                  html_content, flags=re.DOTALL)

        # Add classes to headers to prevent conflicts with page headers.
        # NOTE(review): this only matches attribute-less tags, so headings the
        # anchor pass above already rewrote to '<hN id="...">' do not receive
        # the class - confirm whether that ordering is intended.
        for level in range(1, 7):
            html_content = html_content.replace(f'<h{level}>', f'<h{level} class="content-header">')

        # Use the first h1 as title if available, otherwise fallback to metadata or filename
        if first_h1:
            title = first_h1
        elif 'title' in metadata:
            title = metadata['title']
        else:
            title = Path(file_path).stem.replace('-', ' ').replace('_', ' ').title()

        # Calculate reading time
        reading_time, word_count = calculate_reading_time(html_content)

        # Extract tags
        tags = extract_tags_from_content(html_content, metadata, file_path)

        # Find series posts if this post is part of a series
        series_posts = find_series_posts(metadata, file_path)

        # Generate unique icon for this content
        unique_icon = generate_unique_svg_icon(title, size=32)

        return {
            'content': html_content,
            'title': title,
            'metadata': metadata,
            'reading_time': reading_time,
            'word_count': word_count,
            'tags': tags,
            'series_posts': series_posts,
            'series_name': metadata.get('series'),
            'unique_icon': unique_icon
        }
    except Exception as e:
        # Escape the message: it is interpolated into HTML and may contain
        # markup-significant characters (e.g. from a file path).
        return {
            'content': f'<p>Error reading file: {html.escape(str(e))}</p>',
            'title': 'Error',
            'metadata': {}
        }
|
|
|
|
@app.route('/')
def index():
    """Render the site homepage from the ``homepage.html`` template."""
    year = datetime.now().year
    return render_template('homepage.html', current_year=year, title="Home")
|
|
|
|
|
|
@app.route('/health')
def health_check():
    """Report liveness for monitoring.

    Returns a mapping with a fixed ``status`` of ``'healthy'`` and the
    current local time as an ISO-8601 ``timestamp``.
    """
    payload = {'status': 'healthy'}
    payload['timestamp'] = datetime.now().isoformat()
    return payload
|
|
|
|
|
|
@app.route('/search')
def search_page():
    """Render the interactive search page from ``search.html``."""
    context = {
        'title': 'Search',
        'breadcrumbs': [],
        'current_year': datetime.now().year,
        'current_page': 'Search',
    }
    return render_template('search.html', **context)
|
|
|
|
|
|
def _convert_unified_outlines_cache(unified_cache):
|
|
"""Convert unified cache outlines format to template-expected format."""
|
|
articles_list = []
|
|
|
|
for file_path, outlines in unified_cache.get('articles', {}).items():
|
|
if not outlines:
|
|
continue
|
|
|
|
try:
|
|
full_path = Path(file_path)
|
|
content_data = render_markdown_file(full_path)
|
|
pub_date = extract_intelligent_date(full_path, content_data)
|
|
relative_path = str(full_path.relative_to(DATA_DIR))
|
|
url_path = '/' + relative_path[:-3]
|
|
|
|
processed_outlines = []
|
|
for outline in outlines:
|
|
if isinstance(outline, dict) and 'text' in outline:
|
|
processed_outlines.append(outline)
|
|
|
|
if processed_outlines:
|
|
articles_list.append({
|
|
'title': content_data['title'],
|
|
'url': url_path,
|
|
'date': pub_date,
|
|
'category': full_path.parent.name.replace('-', ' ').title(),
|
|
'outlines': processed_outlines
|
|
})
|
|
except Exception:
|
|
continue
|
|
|
|
articles_list.sort(key=lambda x: x['date'] if x['date'] else datetime(1900, 1, 1), reverse=True)
|
|
return {
|
|
'articles': articles_list,
|
|
'total_count': unified_cache.get('total_count', sum(len(article['outlines']) for article in articles_list))
|
|
}
|
|
|
|
def _convert_unified_quotes_cache(unified_cache):
|
|
"""Convert unified cache quotes format to template-expected format."""
|
|
articles_list = []
|
|
|
|
for file_path, quotes in unified_cache.get('articles', {}).items():
|
|
if not quotes:
|
|
continue
|
|
|
|
try:
|
|
full_path = Path(file_path)
|
|
content_data = render_markdown_file(full_path)
|
|
pub_date = extract_intelligent_date(full_path, content_data)
|
|
relative_path = str(full_path.relative_to(DATA_DIR))
|
|
url_path = '/' + relative_path[:-3]
|
|
|
|
processed_quotes = []
|
|
for quote in quotes:
|
|
if isinstance(quote, dict) and 'text' in quote:
|
|
processed_quotes.append(quote)
|
|
|
|
if processed_quotes:
|
|
articles_list.append({
|
|
'title': content_data['title'],
|
|
'url': url_path,
|
|
'date': pub_date,
|
|
'category': full_path.parent.name.replace('-', ' ').title(),
|
|
'quotes': processed_quotes
|
|
})
|
|
except Exception:
|
|
continue
|
|
|
|
articles_list.sort(key=lambda x: x['date'] if x['date'] else datetime(1900, 1, 1), reverse=True)
|
|
return {
|
|
'articles': articles_list,
|
|
'total_count': unified_cache.get('total_count', sum(len(article['quotes']) for article in articles_list))
|
|
}
|
|
|
|
def _convert_unified_connections_cache(unified_cache):
|
|
"""Convert unified cache connections format to template-expected format."""
|
|
return {
|
|
'outgoing_refs': unified_cache.get('outgoing_refs', {}),
|
|
'incoming_refs': unified_cache.get('incoming_refs', {}),
|
|
'total_count': unified_cache.get('total_count', 0)
|
|
}
|
|
|
|
def _convert_unified_terms_cache(unified_cache):
|
|
"""Convert unified cache terms format to template-expected format."""
|
|
return {
|
|
'terms': unified_cache.get('terms', []),
|
|
'total_occurrences': unified_cache.get('total_occurrences', 0)
|
|
}
|
|
|
|
def _convert_unified_sidenotes_cache(unified_cache):
|
|
"""Convert unified cache sidenotes format to template-expected format."""
|
|
articles_list = []
|
|
|
|
# The unified cache has structure: {'articles': {file_path: [sidenotes]}, 'total_count': int}
|
|
articles_data = unified_cache.get('articles', {})
|
|
|
|
for file_path, sidenotes in articles_data.items():
|
|
if not sidenotes:
|
|
continue
|
|
|
|
try:
|
|
# Get file info
|
|
full_path = Path(file_path)
|
|
content_data = render_markdown_file(full_path)
|
|
|
|
# Extract date for sorting
|
|
pub_date = extract_intelligent_date(full_path, content_data)
|
|
|
|
# Create URL for this file
|
|
relative_path = str(full_path.relative_to(DATA_DIR))
|
|
url_path = '/' + relative_path[:-3] # Remove .md extension
|
|
|
|
# Convert sidenotes to expected format
|
|
processed_sidenotes = []
|
|
for sidenote in sidenotes:
|
|
if isinstance(sidenote, dict) and 'text' in sidenote:
|
|
processed_sidenotes.append({
|
|
'text': sidenote['text'],
|
|
'id': sidenote.get('id') # May be None
|
|
})
|
|
|
|
if processed_sidenotes:
|
|
articles_list.append({
|
|
'title': content_data['title'],
|
|
'url': url_path,
|
|
'date': pub_date,
|
|
'category': full_path.parent.name.replace('-', ' ').title(),
|
|
'sidenotes': processed_sidenotes
|
|
})
|
|
|
|
except Exception as e:
|
|
# Skip files that can't be processed
|
|
continue
|
|
|
|
# Sort by date (most recent first)
|
|
articles_list.sort(key=lambda x: x['date'] if x['date'] else datetime(1900, 1, 1), reverse=True)
|
|
|
|
print(f"Sidenotes conversion: {len(articles_list)} articles processed from {len(unified_cache.get('articles', {}))} files")
|
|
return {
|
|
'articles': articles_list,
|
|
'total_count': unified_cache.get('total_count', sum(len(article['sidenotes']) for article in articles_list))
|
|
}
|
|
|
|
def _extract_all_sidenotes_cached():
    """Return the sidenotes index, rebuilding it if the RAM cache is empty.

    When ``_sidenotes_cache['data']`` is already populated it is returned
    as-is. Otherwise every markdown file matched by ``data/**/*.md`` (relative
    to the current working directory, excluding files named ``index.md``) is
    rendered, its ``<span class="sidenote">`` elements are collected, and the
    result is stored back into ``_sidenotes_cache['data']``.

    Returns:
        ``{'articles': [...], 'total_count': int}`` where each article has the
        keys ``title``, ``url``, ``date``, ``category`` and ``sidenotes`` (a
        list of ``{'text': str, 'id': str | None}``). Articles are sorted by
        date, most recent first; undated ones sort as 1900-01-01.
    """
    # Return pre-loaded cache data if available
    if _sidenotes_cache['data'] is not None:
        return _sidenotes_cache['data']

    # Fallback to rebuild if cache somehow wasn't initialized
    import glob

    # A sidenote span, optionally preceded by its checkbox input whose
    # "sn-..." id is captured. Making the input optional keeps id-less
    # sidenotes even when other sidenotes in the same file do have ids.
    sidenote_pattern = re.compile(
        r'(?:<input type="checkbox" id="(sn-[^"]+)"[^>]*\/>)?<span class="sidenote">(.*?)</span>',
        re.DOTALL,
    )

    articles_list = []

    for file_path in glob.glob('data/**/*.md', recursive=True):
        full_path = Path(file_path)
        # Filter out index files by exact name (not e.g. "reindex.md")
        if full_path.name == 'index.md':
            continue

        try:
            content_data = render_markdown_file(full_path)
            matches = sidenote_pattern.findall(content_data['content'])
            if not matches:
                continue

            # Create URL for this file
            relative_path = str(full_path.relative_to(DATA_DIR))
            url_path = '/' + relative_path[:-3]  # Remove .md extension

            # Extract date for sorting
            pub_date = extract_intelligent_date(full_path, content_data)

            cleaned_sidenotes = []
            for sidenote_id, sidenote_text in matches:
                # Remove HTML links but keep the link text
                sidenote_text = re.sub(r'<a[^>]*?>(.*?)</a>', r'\1', sidenote_text)
                # Collapse runs of whitespace
                sidenote_text = re.sub(r'\s+', ' ', sidenote_text).strip()
                cleaned_sidenotes.append({
                    'text': sidenote_text,
                    'id': sidenote_id or None
                })

            # One entry per file, so posts that share a title are all kept
            articles_list.append({
                'title': content_data['title'],
                'url': url_path,
                'date': pub_date,
                'category': full_path.parent.name.replace('-', ' ').title(),
                'sidenotes': cleaned_sidenotes
            })
        except Exception:
            # Skip files that can't be processed
            continue

    # Sort by date (most recent first)
    articles_list.sort(key=lambda x: x['date'] if x['date'] else datetime(1900, 1, 1), reverse=True)

    # Count total sidenotes
    total_count = sum(len(article['sidenotes']) for article in articles_list)

    # Update cache
    result = {
        'articles': articles_list,
        'total_count': total_count
    }
    _sidenotes_cache['data'] = result

    return result
|
|
|
|
|
|
@app.route('/sidenotes')
def sidenotes_index():
    """Render the site-wide sidenotes index from ``metadata_cache``."""
    data = metadata_cache.get_sidenotes()
    context = {
        'articles': data['articles'],
        'total_count': data['total_count'],
        'title': 'Sidenotes Index',
        'breadcrumbs': [],
        'current_year': datetime.now().year,
        'current_page': 'Sidenotes',
    }
    return render_template('sidenotes.html', **context)
|
|
|
|
|
|
def _extract_all_outlines_cached():
    """Return the outlines index, rebuilding it if the RAM cache is empty.

    When ``_outlines_cache['data']`` is already populated it is returned
    as-is. Otherwise every markdown file matched by ``data/**/*.md`` (relative
    to the current working directory, excluding files named ``index.md``) is
    rendered, its plain-text ``<h1>``-``<h6>`` headings are collected, and the
    result is stored back into ``_outlines_cache['data']``.

    Returns:
        ``{'articles': [...], 'total_count': int}`` where each article has the
        keys ``title``, ``url``, ``date``, ``category`` and ``headings`` (a
        list of dicts with ``level``, ``text``, ``id`` and ``anchor_url``).
        Articles are sorted by date, most recent first; undated ones sort as
        1900-01-01.
    """
    # Return pre-loaded cache data if available
    if _outlines_cache['data'] is not None:
        return _outlines_cache['data']

    # Fallback to rebuild if cache somehow wasn't initialized
    import glob

    # Matches <h1>..<h6> with an optional id attribute and tag-free content
    heading_pattern = re.compile(r'<h([1-6])(?:[^>]*id="([^"]*)")?[^>]*>([^<]+)</h[1-6]>')

    articles_list = []

    for file_path in glob.glob('data/**/*.md', recursive=True):
        full_path = Path(file_path)
        # Filter out index files by exact name (not e.g. "reindex.md")
        if full_path.name == 'index.md':
            continue

        try:
            content_data = render_markdown_file(full_path)
            headings = heading_pattern.findall(content_data['content'])
            if not headings:
                continue

            # Create URL for this file
            relative_path = str(full_path.relative_to(DATA_DIR))
            url_path = '/' + relative_path[:-3]  # Remove .md extension

            # Extract date for sorting
            pub_date = extract_intelligent_date(full_path, content_data)

            page_title = content_data['title'].strip()
            cleaned_headings = []
            for level, heading_id, heading_text in headings:
                text = heading_text.strip()
                # Skip h1 if it matches the title (avoid duplication)
                if level == '1' and text == page_title:
                    continue

                cleaned_headings.append({
                    'level': int(level),
                    'text': text,
                    'id': heading_id if heading_id else None,
                    'anchor_url': f"{url_path}#{heading_id}" if heading_id else url_path
                })

            # Only add if there are headings after filtering. One entry per
            # file, so posts that share a title are all kept.
            if cleaned_headings:
                articles_list.append({
                    'title': content_data['title'],
                    'url': url_path,
                    'date': pub_date,
                    'category': full_path.parent.name.replace('-', ' ').title(),
                    'headings': cleaned_headings
                })
        except Exception:
            # Skip files that can't be processed
            continue

    # Sort by date (most recent first)
    articles_list.sort(key=lambda x: x['date'] if x['date'] else datetime(1900, 1, 1), reverse=True)

    # Count total headings
    total_count = sum(len(article['headings']) for article in articles_list)

    # Update cache
    result = {
        'articles': articles_list,
        'total_count': total_count
    }
    _outlines_cache['data'] = result

    return result
|
|
|
|
|
|
@app.route('/outlines')
def outlines_index():
    """Render the site-wide essay outlines index from ``metadata_cache``."""
    data = metadata_cache.get_outlines()
    context = {
        'articles': data['articles'],
        'total_count': data['total_count'],
        'title': 'Outlines Index',
        'breadcrumbs': [],
        'current_year': datetime.now().year,
        'current_page': 'Outlines',
    }
    return render_template('outlines.html', **context)
|
|
|
|
|
|
def _extract_all_quotes_cached():
    """Return the quotes index, rebuilding it if the RAM cache is empty.

    When ``_quotes_cache['data']`` is already populated it is returned as-is.
    Otherwise every markdown file matched by ``data/**/*.md`` (relative to the
    current working directory, excluding files named ``index.md``) is
    rendered, its ``<blockquote>`` elements are collected, and the result is
    stored back into ``_quotes_cache['data']``.

    Blockquotes that open with a bold label followed by a colon (such as
    ``<strong>Note</strong>:``) and quotes of 20 characters or fewer after
    cleaning are left out.

    Returns:
        ``{'articles': [...], 'total_count': int}`` where each article has the
        keys ``title``, ``url``, ``date``, ``category`` and ``quotes`` (a list
        of strings that may still contain em/strong/i/b tags). Articles are
        sorted by date, most recent first; undated ones sort as 1900-01-01.
    """
    # Return pre-loaded cache data if available
    if _quotes_cache['data'] is not None:
        return _quotes_cache['data']

    # Fallback to rebuild if cache somehow wasn't initialized
    import glob

    blockquote_pattern = re.compile(r'<blockquote[^>]*>(.*?)</blockquote>', re.DOTALL)
    # <p><strong>Label</strong>: content, or the <b> equivalent
    label_pattern = re.compile(r'^\s*<p[^>]*><(?:strong|b)[^>]*>[^<]*</(?:strong|b)>:')

    articles_list = []

    for file_path in glob.glob('data/**/*.md', recursive=True):
        full_path = Path(file_path)
        # Filter out index files by exact name (not e.g. "reindex.md")
        if full_path.name == 'index.md':
            continue

        try:
            content_data = render_markdown_file(full_path)
            raw_quotes = blockquote_pattern.findall(content_data['content'])
            if not raw_quotes:
                continue

            # Create URL for this file
            relative_path = str(full_path.relative_to(DATA_DIR))
            url_path = '/' + relative_path[:-3]  # Remove .md extension

            # Extract date for sorting
            pub_date = extract_intelligent_date(full_path, content_data)

            cleaned_quotes = []
            for raw_quote in raw_quotes:
                # Skip quotes that start with bold labels ("Note:", "Analysis:", ...)
                if label_pattern.match(raw_quote):
                    continue

                # Remove inner HTML tags but preserve basic formatting
                quote_text = re.sub(r'<(?!/?(?:em|strong|i|b)\b)[^>]*>', '', raw_quote)
                quote_text = re.sub(r'\s+', ' ', quote_text).strip()

                # Skip very short quotes (likely not substantive)
                if len(quote_text) > 20:
                    cleaned_quotes.append(quote_text)

            # One entry per file, so posts that share a title are all kept
            if cleaned_quotes:
                articles_list.append({
                    'title': content_data['title'],
                    'url': url_path,
                    'date': pub_date,
                    'category': full_path.parent.name.replace('-', ' ').title(),
                    'quotes': cleaned_quotes
                })
        except Exception:
            # Skip files that can't be processed
            continue

    # Sort by date (most recent first)
    articles_list.sort(key=lambda x: x['date'] if x['date'] else datetime(1900, 1, 1), reverse=True)

    # Count total quotes
    total_count = sum(len(article['quotes']) for article in articles_list)

    # Update cache
    result = {
        'articles': articles_list,
        'total_count': total_count
    }
    _quotes_cache['data'] = result

    return result
|
|
|
|
|
|
@app.route('/quotes')
def quotes_index():
    """Render the site-wide blockquote index from ``metadata_cache``."""
    data = metadata_cache.get_quotes()
    context = {
        'articles': data['articles'],
        'total_count': data['total_count'],
        'title': 'Quotes Index',
        'breadcrumbs': [],
        'current_year': datetime.now().year,
        'current_page': 'Quotes',
    }
    return render_template('quotes.html', **context)
|
|
|
|
|
|
def _extract_all_connections_cached():
    """Return the cross-reference index, rebuilding it only on a cache miss.

    The result is stored in ``_connections_cache['data']`` and returned as-is
    on later calls (no TTL check is performed here).

    Returns:
        dict with keys:
          * ``articles``: list of article dicts (``title``, ``url``, ``date``,
            ``category``, ``connections``, ``outgoing_connections``,
            ``incoming_connections``), newest first. Only articles with at
            least one outgoing or incoming link are included.
          * ``total_count`` / ``total_outgoing``: number of outgoing links.
          * ``total_incoming``: number of incoming links.
    """
    cached = _connections_cache['data']
    if cached is not None:
        return cached

    # Fallback to rebuild if cache somehow wasn't initialized
    import glob
    from collections import defaultdict

    # Matches <a href="/internal/path">link text</a>; compiled once because it
    # is applied to every rendered article.
    link_pattern = re.compile(r'<a[^>]*href="(/[^"]*)"[^>]*>(.*?)</a>', re.DOTALL)

    articles_data = {}  # url -> {title, url, date, category, outgoing_connections}
    incoming_connections = defaultdict(list)  # target_url -> [source connections]

    # Compare the file *name* so that only real index pages are excluded
    # (a plain ``endswith('index.md')`` would also drop e.g. ``reindex.md``).
    all_files = [
        f for f in glob.glob('data/**/*.md', recursive=True)
        if Path(f).name != 'index.md'
    ]

    # First pass: collect all articles and their outgoing connections
    for file_path in all_files:
        try:
            full_path = Path(file_path)
            content_data = render_markdown_file(full_path)
            html_content = content_data['content']

            # Clean URL: path relative to the data dir, forward slashes,
            # without the trailing ".md".
            source_url = '/' + full_path.relative_to(DATA_DIR).as_posix()[:-3]
            source_title = content_data['title']

            outgoing = []
            articles_data[source_url] = {
                'title': source_title,
                'url': source_url,
                'date': extract_intelligent_date(full_path, content_data),
                'category': full_path.parent.name.replace('-', ' ').title(),
                'outgoing_connections': outgoing,
            }

            for link_url, link_text in link_pattern.findall(html_content):
                # Skip protocol-relative URLs, static assets and self-references.
                if link_url.startswith(('//', '/static')) or link_url == source_url:
                    continue

                # Strip nested markup and collapse whitespace in the link text.
                link_text = re.sub(r'<[^>]*>', '', link_text)
                link_text = re.sub(r'\s+', ' ', link_text).strip()

                outgoing.append({
                    'target_url': link_url,
                    'link_text': link_text,
                })
                incoming_connections[link_url].append({
                    'source_url': source_url,
                    'source_title': source_title,
                    'link_text': link_text,
                })
        except Exception:
            # Deliberately best-effort: a file that cannot be rendered is
            # left out of the index instead of failing the whole rebuild.
            continue

    # Second pass: attach incoming links and keep only connected articles
    articles_list = []
    for url, article in articles_data.items():
        incoming = incoming_connections.get(url, [])
        if not (article['outgoing_connections'] or incoming):
            continue
        articles_list.append({
            'title': article['title'],
            'url': article['url'],
            'date': article['date'],
            'category': article['category'],
            'connections': article['outgoing_connections'],  # Keep for backward compatibility
            'outgoing_connections': article['outgoing_connections'],
            'incoming_connections': incoming,
        })

    # Undated articles sort last.
    articles_list.sort(key=lambda x: x['date'] if x['date'] else datetime(1900, 1, 1), reverse=True)

    total_outgoing = sum(len(article['outgoing_connections']) for article in articles_list)
    total_incoming = sum(len(article['incoming_connections']) for article in articles_list)

    result = {
        'articles': articles_list,
        'total_count': total_outgoing,  # Keep backward compatibility
        'total_outgoing': total_outgoing,
        'total_incoming': total_incoming,
    }
    _connections_cache['data'] = result
    return result
|
|
|
|
|
|
@app.route('/connections')
def connections_index():
    """Render the index of cross-references between essays from the metadata cache."""
    data = metadata_cache.get_connections()

    context = dict(
        articles=data['articles'],
        total_count=data['total_count'],
        # The directional totals are read with .get(), so they may be None.
        total_outgoing=data.get('total_outgoing'),
        total_incoming=data.get('total_incoming'),
        title='Connections Index',
        breadcrumbs=[],
        current_year=datetime.now().year,
        current_page='Connections',
    )
    return render_template('connections.html', **context)
|
|
|
|
|
|
@app.route('/graph/data')
def graph_data():
    """API endpoint that returns graph data for network visualization.

    Returns:
        JSON object with ``nodes`` (``id``, ``title``, ``category``, ``url``),
        ``edges`` (``source``, ``target``, ``link_text``) and ``stats``
        (``total_nodes``, ``total_edges``).
    """
    connections_data = metadata_cache.get_connections()

    edges = []
    # A dict is used as an insertion-ordered set so that the node list comes
    # out in the same order on every run (iteration order of a set of strings
    # varies between processes).
    node_ids = {}

    for article in connections_data['articles']:
        source_id = article['url']
        node_ids[source_id] = None

        # Use outgoing_connections for graph edges (backward compatibility: also check connections)
        connections_list = article.get('outgoing_connections', article.get('connections', []))
        for connection in connections_list:
            target_id = connection['target_url']
            node_ids[target_id] = None
            edges.append({
                'source': source_id,
                'target': target_id,
                'link_text': connection['link_text'],
            })

    # Resolve titles/categories for nodes that are known blog posts.
    post_lookup = {post['url']: post for post in metadata_cache.get_blog_posts()}

    nodes = []
    for node_id in node_ids:
        post = post_lookup.get(node_id)
        if post:
            title = post['title']
            category = post['category']
        else:
            # Fall back to the last non-empty URL segment; directory-style
            # targets end in "/" and would otherwise get an empty title.
            title = node_id.rstrip('/').split('/')[-1] or node_id
            category = 'Unknown'
        nodes.append({
            'id': node_id,
            'title': title,
            'category': category,
            'url': node_id,
        })

    return jsonify({
        'nodes': nodes,
        'edges': edges,
        'stats': {
            'total_nodes': len(nodes),
            'total_edges': len(edges),
        },
    })
|
|
|
|
|
|
@app.route('/graph')
def graph_visualization():
    """Render the page hosting the interactive cross-reference graph."""
    page_title = 'Cross-Reference Graph'
    return render_template(
        'graph.html',
        title=page_title,
        breadcrumbs=[],
        current_year=datetime.now().year,
        current_page=page_title,
    )
|
|
|
|
|
|
|
|
@app.route('/terms')
def terms_index():
    """Render the book-style term index from the metadata cache."""
    cached = metadata_cache.get_terms()

    context = {
        'terms': cached['terms'],
        'total_terms': cached['total_terms'],
        'total_occurrences': cached['total_occurrences'],
        'title': 'Term Index',
        'breadcrumbs': [],
        'current_year': datetime.now().year,
        'current_page': 'Term Index',
    }
    return render_template('terms.html', **context)
|
|
|
|
|
|
|
|
@app.route('/random')
def random_post():
    """Redirect to a random document from anywhere in /data/.

    Index pages (files named exactly ``index.md``) are never chosen. When no
    candidate exists the visitor is sent to ``/directory`` instead.
    """
    import random
    import glob

    # Compare the file name exactly: ``endswith('index.md')`` would also
    # exclude documents such as ``reindex.md``.
    candidates = [
        Path(f) for f in glob.glob('data/**/*.md', recursive=True)
        if Path(f).name != 'index.md'
    ]

    if not candidates:
        return redirect('/directory')

    chosen = random.choice(candidates)
    # data/essays/2010-01-example.md -> /essays/2010-01-example
    # Only the leading "data" component and the final ".md" suffix are removed;
    # chained str.replace() calls would also rewrite inner matches such as
    # "metadata/" or ".md" inside a file name.
    url_path = '/' + chosen.relative_to('data').with_suffix('').as_posix()
    return redirect(url_path)
|
|
|
|
|
|
def get_random_personality_from_collection(collection_path):
    """Redirect to a random personality page, optionally limited to one collection.

    Args:
        collection_path: Collection directory name under
            ``data/artificial-intelligence/personalities``. A falsy value
            selects from every collection.

    Returns:
        A redirect response to the chosen personality, or to the collection
        (or personalities root) page when no candidate file exists.
    """
    import random
    import glob

    if collection_path:
        # Get files from specific collection
        pattern = f'data/artificial-intelligence/personalities/{collection_path}/*.md'
        fallback_url = f'/artificial-intelligence/personalities/{collection_path}'
    else:
        # Get all personality files
        pattern = 'data/artificial-intelligence/personalities/**/*.md'
        fallback_url = '/artificial-intelligence/personalities'

    # Exclude index pages by exact file name (``endswith('index.md')`` would
    # also exclude names that merely end in "index").
    personality_files = [
        Path(f) for f in glob.glob(pattern, recursive=True)
        if Path(f).name != 'index.md'
    ]

    if not personality_files:
        return redirect(fallback_url)

    chosen = random.choice(personality_files)
    # data/artificial-intelligence/personalities/major-arcana/the-fool.md
    #   -> /artificial-intelligence/personalities/major-arcana/the-fool
    # Strip only the leading "data" component and the trailing ".md" suffix so
    # inner occurrences of those substrings are left untouched.
    url_path = '/' + chosen.relative_to('data').with_suffix('').as_posix()
    return redirect(url_path)
|
|
|
|
|
|
@app.route('/random/personality')
@app.route('/random/personality/')
def random_personality():
    """Send the visitor to a randomly chosen AI personality, across all collections."""
    # Passing None makes the helper search every collection.
    any_collection = None
    return get_random_personality_from_collection(any_collection)
|
|
|
|
|
|
@app.route('/random/<collection>')
def random_from_collection(collection):
    """Redirect to a random personality within one named collection.

    Unknown collection names are redirected to the personalities overview.
    """
    known_collections = (
        'major-arcana', 'seven-virtues', 'programming-languages',
        'greek-pantheon', 'roman-pantheon', 'hindu-pantheon',
        'operating-systems', 'supporting-cast', 'goddess-archetypes',
        'biblical-characters', 'biblical-anthology',
    )

    if collection in known_collections:
        return get_random_personality_from_collection(collection)

    return redirect('/artificial-intelligence/personalities')
|
|
|
|
|
|
@app.route('/archive')
def archive_index():
    """Render the complete archive: every post, bucketed by publication year."""
    # Bucket posts by year, preserving the cache's order inside each bucket.
    posts_by_year = {}
    for entry in metadata_cache.get_blog_posts():
        posts_by_year.setdefault(entry['pub_date'].year, []).append(entry)

    # Newest year first; within a year, newest post first.
    grouped_posts = {
        year: sorted(posts_by_year[year], key=lambda p: p['pub_date'], reverse=True)
        for year in sorted(posts_by_year, reverse=True)
    }

    return render_template(
        'archive.html',
        archive_title='Complete',
        archive_description=None,
        grouped_posts=grouped_posts,
        breadcrumbs=[],
        current_year=datetime.now().year,
        current_page='Archive',
    )
|
|
|
|
|
|
@app.route('/archive/<int:year>')
def archive_year(year):
    """Render the archive for one year, grouped by month (latest month first).

    Aborts with 404 when the year has no posts.
    """
    month_names = ['', 'January', 'February', 'March', 'April', 'May', 'June',
                   'July', 'August', 'September', 'October', 'November', 'December']

    # Bucket this year's posts by month number; names are applied afterwards.
    posts_by_month = {}
    for entry in metadata_cache.get_blog_posts():
        published = entry['pub_date']
        if published.year == year:
            posts_by_month.setdefault(published.month, []).append(entry)

    if not posts_by_month:
        abort(404)

    # Months in descending calendar order, posts newest-first inside each.
    grouped_posts = {
        month_names[number]: sorted(posts_by_month[number],
                                    key=lambda p: p['pub_date'], reverse=True)
        for number in sorted(posts_by_month, reverse=True)
    }

    return render_template(
        'archive.html',
        archive_title=str(year),
        archive_description=f'Essays and AI writings from {year}.',
        grouped_posts=grouped_posts,
        breadcrumbs=[{'name': 'Archive', 'url': '/archive'}],
        current_year=datetime.now().year,
        current_page=f'{year} Archive',
    )
|
|
|
|
|
|
@app.route('/archive/<int:year>/<int:month>')
def archive_month(year, month):
    """Render the archive for one month of one year, grouped by post category.

    Aborts with 404 when no post was published in that month.
    """
    matching = []
    for entry in metadata_cache.get_blog_posts():
        published = entry['pub_date']
        if published.year == year and published.month == month:
            matching.append(entry)

    if not matching:
        abort(404)

    # Single-level grouping by category, in first-seen order.
    grouped_posts = {}
    for entry in matching:
        grouped_posts.setdefault(entry['category'], []).append(entry)

    month_names = ['', 'January', 'February', 'March', 'April', 'May', 'June',
                   'July', 'August', 'September', 'October', 'November', 'December']
    month_name = month_names[month]
    heading = f'{month_name} {year}'

    trail = [
        {'name': 'Archive', 'url': '/archive'},
        {'name': str(year), 'url': f'/archive/{year}'},
    ]

    return render_template(
        'archive.html',
        archive_title=heading,
        archive_description=f'Essays and AI writings from {month_name} {year}.',
        grouped_posts=grouped_posts,
        breadcrumbs=trail,
        current_year=datetime.now().year,
        current_page=f'{month_name} {year} Archive',
    )
|
|
|
|
|
|
@app.route('/themes')
def themes_index():
    """Render the Themes page from ``themes/index.md``.

    When that file does not exist, the request is handed to ``serve_path``
    so the themes directory listing is shown instead.
    """
    index_file = DATA_DIR / 'themes' / 'index.md'

    if not index_file.exists():
        # Fallback to directory listing if no index.md
        return serve_path('themes')

    rendered = render_markdown_file(index_file)
    # The page is presented as a folder, so it gets a folder-style icon.
    icon = generate_folder_icon('Themes', size=32)

    return render_template(
        'post.html',
        content=rendered['content'],
        title='Themes',
        metadata=rendered.get('metadata', {}),
        breadcrumbs=[],
        current_year=datetime.now().year,
        current_page='Themes',
        unique_icon=icon,
        parent_directory=None,
    )
|
|
|
|
|
|
@app.route('/directory')
def directory_index():
    """Render the listing of the data root (the former homepage).

    The root ``index.md``, when present, is rendered above the listing, or
    below it when its text exceeds 150 words.
    """
    items = get_directory_structure(DATA_DIR)

    root_index = DATA_DIR / 'index.md'
    index_content = render_markdown_file(root_index) if root_index.exists() else None

    content_position = 'top'  # Default position
    if index_content is not None:
        # Word count is taken on the rendered HTML with tags stripped.
        plain_text = re.sub(r'<[^>]+>', '', index_content['content'])
        if len(plain_text.split()) > 150:
            content_position = 'bottom'

    # Gallery mode: at least three images making up half or more of the files.
    image_items = [entry for entry in items if entry['is_image']]
    file_count = len([entry for entry in items if not entry['is_dir']])
    image_count = len(image_items)
    is_image_gallery = image_count >= 3 and file_count > 0 and (image_count / file_count) >= 0.5

    return render_template(
        'directory.html',
        items=items,
        current_path='',
        title='Kenneth Reitz',
        breadcrumbs=[],
        index_content=index_content,
        content_position=content_position,
        is_image_gallery=is_image_gallery,
        image_items=image_items,
        current_year=datetime.now().year,
    )
|
|
|
|
def _serve_path_parent_directory(full_path):
    """Return back-link data for the directory containing ``full_path``, or None at the data root."""
    parent_path = full_path.parent
    if parent_path == DATA_DIR:
        # Don't show parent for root-level content
        return None
    display_name = parent_path.name.replace('-', ' ').replace('_', ' ').title()
    return {
        'display_name': display_name,
        'url': '/' + str(parent_path.relative_to(DATA_DIR)),
        'icon': generate_folder_icon(display_name, size=20),
    }


@app.route('/<path:path>')
def serve_path(path):
    """Serve files and directories from the data folder.

    Resolution order for ``path`` (relative to ``DATA_DIR``):
      1. an existing directory  -> ``directory.html`` listing
      2. an existing ``.md`` file, or ``path + '.md'`` for clean URLs
         -> ``post.html``
      3. an image file          -> ``photo.html`` with its sibling images
      4. anything else          -> the raw file via ``send_file``

    Aborts with 404 when nothing matches or when ``path`` tries to leave the
    data folder.
    """
    # Security: the path comes straight from the URL. Refuse absolute paths
    # and ".." segments so a request can never resolve outside DATA_DIR
    # (the final branch below hands the file to send_file()).
    requested = Path(path)
    if requested.is_absolute() or '..' in requested.parts:
        abort(404)

    full_path = DATA_DIR / path

    # If the path doesn't exist, try adding .md extension for markdown files
    if not full_path.exists():
        md_path = DATA_DIR / (path + '.md')
        if md_path.exists() and md_path.suffix == '.md':
            full_path = md_path
        else:
            abort(404)

    # Breadcrumbs cover every ancestor segment; the current page is excluded.
    # Empty segments (trailing or doubled slashes) are dropped so that
    # "essays/" yields the same crumbs and title as "essays".
    path_parts = [part for part in path.split('/') if part]
    breadcrumbs = []
    current = ''
    for part in path_parts[:-1]:
        current = f"{current}/{part}" if current else part
        breadcrumbs.append({
            'name': part.replace('-', ' ').replace('_', ' ').title(),
            'url': f"/{current}"
        })

    image_suffixes = ['.jpg', '.jpeg', '.png', '.gif', '.webp']

    if full_path.is_dir():
        # Directory listing
        items = get_directory_structure(full_path)

        # Check if this is an image gallery (50% or more images)
        image_items = [item for item in items if item['is_image']]
        total_files = [item for item in items if not item['is_dir']]
        is_image_gallery = len(image_items) >= 3 and len(total_files) > 0 and (len(image_items) / len(total_files)) >= 0.5

        # Check for index.md in the directory
        index_file = full_path / 'index.md'
        index_content = None
        content_position = 'top'  # Default position
        if index_file.exists():
            index_content = render_markdown_file(index_file)

            # Long introductions (more than 150 words of text, HTML tags
            # stripped) are moved below the listing.
            content_text = re.sub(r'<[^>]+>', '', index_content['content'])
            if len(content_text.split()) > 150:
                content_position = 'bottom'

        # Use title from index.md if available, otherwise fall back to directory name
        if index_content and index_content.get('title'):
            title = index_content['title']
        else:
            dir_name = path_parts[-1] if path_parts else full_path.name
            title = dir_name.replace('-', ' ').replace('_', ' ').title()

        return render_template('directory.html',
                               items=items,
                               current_path=path,
                               title=title,
                               breadcrumbs=breadcrumbs,
                               index_content=index_content,
                               content_position=content_position,
                               is_image_gallery=is_image_gallery,
                               image_items=image_items,
                               parent_directory=_serve_path_parent_directory(full_path),
                               current_year=datetime.now().year,
                               current_page=title)

    elif full_path.suffix == '.md':
        # Markdown file
        content_data = render_markdown_file(full_path)

        # Find related posts for essays and AI writings
        related_posts = []
        prev_post = None
        next_post = None
        if 'essays' in path or ('artificial-intelligence' in path and 'writings' in path):
            relative_name = str(full_path.relative_to(DATA_DIR))
            related_posts = find_related_posts(relative_name)
            prev_post, next_post = find_adjacent_posts(relative_name)

        # Generate description from content for social sharing:
        # first paragraph of the tag-stripped text, capped at 200 characters.
        content_text = re.sub(r'<[^>]+>', '', content_data['content']).strip()
        description = ""
        if content_text:
            first_para = content_text.split('\n\n')[0]
            description = first_para[:200] + '...' if len(first_para) > 200 else first_para

        return render_template('post.html',
                               content=content_data['content'],
                               title=content_data['title'],
                               metadata=content_data['metadata'],
                               description=description,
                               breadcrumbs=breadcrumbs,
                               current_path=path,
                               current_year=datetime.now().year,
                               current_page=content_data['title'],
                               related_posts=related_posts,
                               reading_time=content_data.get('reading_time'),
                               word_count=content_data.get('word_count'),
                               prev_post=prev_post,
                               next_post=next_post,
                               tags=content_data.get('tags', []),
                               series_posts=content_data.get('series_posts', []),
                               series_name=content_data.get('series_name'),
                               unique_icon=content_data.get('unique_icon'),
                               parent_directory=_serve_path_parent_directory(full_path))

    elif full_path.suffix.lower() in image_suffixes:
        # Image file - list every image in the same directory so the template
        # can offer gallery navigation.
        parent_dir = full_path.parent
        gallery_images = []

        if parent_dir.exists():
            for img in sorted(parent_dir.iterdir()):
                if img.suffix.lower() in image_suffixes:
                    gallery_images.append({
                        'name': img.name,
                        'path': f"/static/data/{img.relative_to(DATA_DIR)}",
                        'url': f"/{img.relative_to(DATA_DIR)}",
                        'is_current': img == full_path
                    })

        photo_title = full_path.stem.replace('-', ' ').replace('_', ' ').title()
        return render_template('photo.html',
                               image_path=f"/static/data/{path}",
                               title=photo_title,
                               breadcrumbs=breadcrumbs,
                               gallery_images=gallery_images,
                               current_path=path,
                               current_year=datetime.now().year,
                               current_page=photo_title)

    else:
        # Other files - serve directly
        from flask import send_file
        return send_file(full_path)
|
|
|
|
@app.route('/static/data/<path:path>')
def serve_data_file(path):
    """Serve static files from the data directory.

    Images are sent with a 7-day ``Cache-Control`` lifetime, every other file
    with 1 hour. Aborts with 404 when ``path`` is not a regular file under
    the data directory.
    """
    # Security: ``path`` is taken from the URL and passed to send_file().
    # Reject absolute paths and ".." segments so the request cannot escape
    # DATA_DIR.
    requested = Path(path)
    if requested.is_absolute() or '..' in requested.parts:
        abort(404)

    full_path = DATA_DIR / path
    if not full_path.exists() or not full_path.is_file():
        abort(404)

    from flask import send_file, make_response
    response = make_response(send_file(full_path))

    # Add caching headers for static assets
    if full_path.suffix.lower() in ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg']:
        # Cache images for 7 days
        response.headers['Cache-Control'] = 'public, max-age=604800'
    else:
        # Cache other static files for 1 hour
        response.headers['Cache-Control'] = 'public, max-age=3600'

    return response
|
|
|
|
|
|
|
|
@lru_cache(maxsize=500)
def _get_article_icon_cached(article_path):
    """Return icon data for an article or directory URL (memoised per path).

    Lookup order:
      1. the blog-post cache (uses the stored ``unique_icon`` when present);
      2. the filesystem: a directory gets a folder icon titled from its
         ``index.md`` (or from its name), a markdown file gets an article
         icon titled from the file;
      3. a title guessed from the last URL segment.

    Args:
        article_path: URL path of the article, with or without a leading "/".

    Returns:
        ``{'success': True, 'icon': <svg>, 'title': <str>}`` on success, or
        ``{'success': False, 'error': 'Article not found'}`` when no title
        can be derived.
    """
    # Normalize the path - ensure it starts with /
    if not article_path.startswith('/'):
        article_path = '/' + article_path

    # Look up the article in blog_posts cache
    for post in _collect_all_blog_posts_cached():
        if post['url'] == article_path:
            # Generate an icon on the fly when the cached post carries none.
            icon_svg = post.get('unique_icon') or generate_unique_svg_icon(post['title'], size=20)
            return {
                'success': True,
                'icon': icon_svg,
                'title': post['title']
            }

    # If not found in blog_posts, try to read the file directly
    try:
        file_path = DATA_DIR / article_path.lstrip('/')

        # Check if it's a directory path (ends with / or is a directory)
        if article_path.endswith('/') or file_path.is_dir():
            # Always look for index.md *inside* the addressed path. Using
            # with_name() here would read the parent directory's index.md
            # and report the parent's title for a non-existent directory.
            index_path = file_path / 'index.md'

            if index_path.exists():
                content_data = render_markdown_file(index_path)
                if content_data and 'title' in content_data:
                    return {
                        'success': True,
                        'icon': generate_folder_icon(content_data['title'], size=20),
                        'title': content_data['title']
                    }

            # If no index.md, use the directory name
            dir_name = file_path.name if file_path.is_dir() else article_path.strip('/').split('/')[-1]
            title_guess = dir_name.replace('-', ' ').replace('_', ' ').title()
            return {
                'success': True,
                'icon': generate_folder_icon(title_guess, size=20),
                'title': title_guess
            }
        else:
            # Regular file handling. Append ".md" to the full name rather
            # than using with_suffix(), which would replace everything after
            # the last dot in slugs such as "v1.0-notes". Paths that already
            # end in ".md" are used unchanged.
            if file_path.suffix == '.md':
                md_file_path = file_path
            else:
                md_file_path = file_path.with_name(file_path.name + '.md')

            if md_file_path.exists():
                content_data = render_markdown_file(md_file_path)
                if content_data and 'title' in content_data:
                    return {
                        'success': True,
                        'icon': generate_unique_svg_icon(content_data['title'], size=20),
                        'title': content_data['title']
                    }
    except Exception:
        # Best-effort lookup: any filesystem/render problem falls through to
        # the URL-derived guess below.
        pass

    # Final fallback: extract title from URL and generate icon. Trailing
    # slashes are ignored so the last real segment is used.
    title_guess = article_path.rstrip('/').split('/')[-1].replace('-', ' ').replace('_', ' ').title()
    if title_guess:
        return {
            'success': True,
            'icon': generate_unique_svg_icon(title_guess, size=20),
            'title': title_guess
        }

    return {'success': False, 'error': 'Article not found'}
|
|
|
|
@app.route('/api/icon/<path:article_path>')
def api_get_article_icon(article_path):
    """Return, as JSON, the icon SVG and title for the given article path.

    Any exception is reported in the JSON body as
    ``{'success': False, 'error': <message>}``.
    """
    try:
        return jsonify(_get_article_icon_cached(article_path))
    except Exception as exc:
        failure = {'success': False, 'error': str(exc)}
        return jsonify(failure)
|
|
|
|
@app.route('/api/debug-cache')
def debug_cache():
    """Debug endpoint: report a small sample of the blog-posts cache as JSON.

    The payload holds the total post count, up to five posts whose URL
    contains "software", and the URLs of the first ten posts.
    """
    try:
        all_posts = _collect_all_blog_posts_cached()

        def mentions_software(entry):
            return 'software' in entry.get('url', '')

        software_matches = list(filter(mentions_software, all_posts))
        leading_urls = [entry.get('url') for entry in all_posts[:10]]

        return jsonify({
            'total_posts': len(all_posts),
            'software_posts': software_matches[:5],  # First 5 for debugging
            'sample_urls': leading_urls,
        })
    except Exception as exc:
        return jsonify({'error': str(exc)})
|
|
|
|
@app.route('/api/search')
def api_search():
    """API endpoint for full-text search across the knowledge base.

    Reads the ``q`` query parameter, walks ``DATA_DIR`` recursively and
    matches the query (case-insensitively) against each entry's name, its
    relative path and, for markdown files, its content.

    Returns:
        JSON list of result objects (``name``, ``type``, ``path``,
        ``display_path``, ``snippet``, ``relevance`` and, for known posts,
        ``unique_icon``) sorted by descending relevance. An empty or
        whitespace-only query yields an empty list.
    """
    # Surrounding whitespace is ignored; otherwise a query of " " would walk
    # the whole tree and match nearly every document.
    query = request.args.get('q', '').strip().lower()
    if not query:
        return jsonify([])

    results = []
    highlight_pattern = re.compile(f'({re.escape(query)})', re.IGNORECASE)

    # Build the URL -> post lookup once per request instead of fetching and
    # scanning the post list for every matching file. The first post seen
    # for a URL wins.
    first_post_by_url = {}
    try:
        for post in metadata_cache.get_blog_posts():
            first_post_by_url.setdefault(post['url'], post)
    except Exception:
        # Icons are optional decoration; search still works without them.
        pass

    def build_snippet(raw_content, lowered_content):
        """Return ~200 characters of context around the first match, with the match in <mark>."""
        query_pos = lowered_content.find(query)
        if query_pos == -1:
            return ""

        # Cut the snippet from the original text so its casing is preserved.
        # Lower-casing can change the length of a few Unicode strings; when
        # that happens the offsets no longer line up, so fall back to the
        # lower-cased text.
        if len(raw_content) == len(lowered_content):
            source_text = raw_content
        else:
            source_text = lowered_content

        start = max(0, query_pos - 100)
        end = min(len(lowered_content), query_pos + len(query) + 100)
        snippet_text = source_text[start:end]

        # Clean up the snippet (remove markdown syntax)
        snippet_text = re.sub(r'[#*`_\[\]()]', '', snippet_text)
        snippet_text = re.sub(r'\s+', ' ', snippet_text).strip()

        # NOTE(review): the snippet is returned as HTML (<mark>) but the
        # document text itself is not HTML-escaped - confirm how the client
        # inserts it.
        snippet = highlight_pattern.sub(r'<mark>\1</mark>', snippet_text)

        # Add ellipsis if snippet is truncated
        if start > 0:
            snippet = "..." + snippet
        if end < len(lowered_content):
            snippet = snippet + "..."
        return snippet

    def search_path(current_path: Path, display_path: str = ""):
        """Recursively search files and directories under ``current_path``.

        The filesystem is walked directly; entries whose name starts with
        "." are skipped. Matches are appended to ``results``.
        """
        for item in current_path.iterdir():
            if item.name.startswith('.'):
                continue

            relative_path = str(item.relative_to(DATA_DIR))
            node_name = item.name.lower()
            node_path = relative_path.lower()
            is_markdown = item.suffix == '.md'

            raw_content = ""
            if is_markdown and item.is_file():
                try:
                    raw_content = item.read_text(encoding='utf-8')
                except Exception:
                    # Unreadable files are still searchable by name/path.
                    raw_content = ""
            node_content = raw_content.lower()

            item_display_path = f"{display_path}/{item.name}" if display_path else item.name

            in_name = query in node_name
            in_path = query in node_path
            in_content = query in node_content

            if in_name or in_path or in_content:
                snippet = ""
                if is_markdown and in_content:
                    snippet = build_snippet(raw_content, node_content)

                # Name matches weigh most, then path matches; content matches
                # add a small bonus per occurrence.
                relevance = 0
                if in_name:
                    relevance += 10
                    if node_name.startswith(query):
                        relevance += 5
                if in_path:
                    relevance += 3
                if in_content:
                    relevance += 1
                    relevance += node_content.count(query) * 0.1

                result = {
                    'name': item.name,
                    'type': 'directory' if item.is_dir() else ('article' if is_markdown else 'file'),
                    'path': relative_path,
                    'display_path': item_display_path,
                    'snippet': snippet,
                    'relevance': relevance,
                }

                # Add unique_icon for articles
                if is_markdown:
                    clean_url = '/' + relative_path[:-3]  # Remove .md extension
                    post = first_post_by_url.get(clean_url)
                    if post is not None and 'unique_icon' in post:
                        result['unique_icon'] = post['unique_icon']

                results.append(result)

            if item.is_dir():
                search_path(item, item_display_path)

    # Start searching from the data directory
    search_path(DATA_DIR)

    results.sort(key=lambda x: x['relevance'], reverse=True)
    return jsonify(results)
|
|
|
|
|
|
|
|
def _rss_description_from_markdown(raw_markdown):
    """Return a short plain-text description taken from the first substantial line of ``raw_markdown``."""
    # Skip front matter if present
    if raw_markdown.startswith('---'):
        parts = raw_markdown.split('---', 2)
        if len(parts) >= 3:
            raw_markdown = parts[2].strip()

    # Lines that never make a good description.
    skip_patterns = (
        r'^\s*#{1,6}\s',                     # headers
        r'^\s*!\[[^\]]*\]\([^)]*\)\s*$',     # images
        r'^\s*\[Image #\d+\]\s*$',           # image references
        r'^\s*\*[^*]*\*\s*$',                # date/metadata lines
        r'^\s*[-*_]{3,}\s*$',                # horizontal rules
    )

    for line in raw_markdown.split('\n'):
        line = line.strip()
        if not line or any(re.match(pattern, line) for pattern in skip_patterns):
            continue

        # Clean up markdown formatting in the line
        clean_line = re.sub(r'!\[[^\]]*\]\([^)]*\)', '', line)  # Remove images
        clean_line = re.sub(r'\[([^\]]*)\]\([^)]*\)', r'\1', clean_line)  # Convert links to text
        clean_line = re.sub(r'[*_]{1,3}([^*_]+)[*_]{1,3}', r'\1', clean_line)  # Remove bold/italic
        clean_line = re.sub(r'`([^`]+)`', r'\1', clean_line)  # Remove code formatting
        # Remove header markers only at the start of the line; an unanchored
        # pattern would also eat "#" inside prose (e.g. "C# is" -> "Cis").
        clean_line = re.sub(r'^\s*#{1,6}\s*', '', clean_line)
        clean_line = clean_line.strip()

        if len(clean_line) > 20:  # Must have substantial content
            return clean_line[:150] + '...' if len(clean_line) > 150 else clean_line

    return ""


def collect_blog_posts():
    """Collect blog posts from essays and AI writings for RSS feed.

    Scans ``DATA_DIR/essays`` and ``DATA_DIR/artificial-intelligence``
    recursively, skipping hidden entries, ``index.md`` files, posts without a
    determinable date and files that fail to render.

    Returns:
        Up to 20 post dicts (``title``, ``url``, ``description``,
        ``pub_date``, ``category``, ``content``), most recent first.
        ``content`` is the rendered HTML cut to 1000 characters.
    """
    posts = []

    # Define blog post directories
    blog_dirs = [
        DATA_DIR / 'essays',
        DATA_DIR / 'artificial-intelligence'  # This will pick up root AI posts and scan subdirs
    ]

    def scan_for_posts(path, category=""):
        if not path.exists() or not path.is_dir():
            return

        for item in sorted(path.iterdir(), reverse=True):  # Most recent first
            if item.name.startswith('.') or item.name.lower() == 'index.md':
                continue

            if item.is_file() and item.suffix == '.md':
                try:
                    content_data = render_markdown_file(item)

                    # Skip posts without determinable dates (no filename date, no YAML date, no content date)
                    pub_date = extract_intelligent_date(item, content_data)
                    if pub_date is None:
                        continue

                    # Create clean URL
                    relative_path = str(item.relative_to(DATA_DIR))
                    clean_url = '/' + relative_path[:-3]  # Remove .md extension

                    # Prefer a description from the raw markdown (before HTML conversion)
                    try:
                        with open(item, 'r', encoding='utf-8') as f:
                            description = _rss_description_from_markdown(f.read())
                    except (OSError, UnicodeDecodeError):
                        # Fallback to HTML method if raw reading fails; the
                        # ellipsis is added only when text was actually cut.
                        description = ""
                        content_text = re.sub(r'<[^>]+>', '', content_data['content']).strip()
                        if content_text:
                            description = content_text[:150] + '...' if len(content_text) > 150 else content_text

                    html_content = content_data['content']
                    posts.append({
                        'title': content_data['title'],
                        'url': clean_url,
                        'description': description,
                        'pub_date': pub_date,
                        'category': category or item.parent.name.replace('-', ' ').title(),
                        'content': html_content[:1000] + '...' if len(html_content) > 1000 else html_content
                    })
                except Exception:
                    # Best-effort: a post that cannot be processed is left
                    # out of the feed rather than breaking it.
                    continue
            elif item.is_dir():
                # Recursively scan subdirectories
                scan_for_posts(item, category or item.name.replace('-', ' ').title())

    # Scan each blog directory
    for blog_dir in blog_dirs:
        if blog_dir.exists():
            category = blog_dir.name.replace('-', ' ').title()
            if 'artificial-intelligence' in str(blog_dir):
                category = 'AI & Consciousness'
            scan_for_posts(blog_dir, category)

    # Sort by publication date (most recent first)
    posts.sort(key=lambda x: x['pub_date'], reverse=True)

    return posts[:20]  # Return most recent 20 posts
|
|
|
|
|
|
# Module-level cache slots, one per generated index. Each slot holds the
# cached payload under 'data' and a 'timestamp' (initially 0); they are
# cleared when the date extraction logic changes.
_blog_posts_cache = dict(data=None, timestamp=0)
_sidenotes_cache = dict(data=None, timestamp=0)
_outlines_cache = dict(data=None, timestamp=0)
_quotes_cache = dict(data=None, timestamp=0)
_connections_cache = dict(data=None, timestamp=0)
_external_links_cache = dict(data=None, timestamp=0)
_terms_cache = dict(data=None, timestamp=0)

CACHE_TTL = 10 * 60 * 60  # 10 hours cache, in seconds

# Force cache invalidation for filename change
import time
_force_cache_clear = time.time()  # Line-by-line filtering instead of paragraph-based
|
|
|
|
|
|
# Initialize unified cache on module load
|
|
class MetadataCache:
    """Clean interface to site metadata cache.

    ``self._data`` is ``None`` until :meth:`initialize` stores the result of
    ``_generate_all_caches_unified()``. Until then every getter returns an
    empty payload instead of raising.
    """

    def __init__(self):
        self._data = None

    def initialize(self):
        """Load all site metadata in a single scan."""
        print("Starting unified cache generation...")
        self._data = _generate_all_caches_unified()
        print("Unified cache generation completed!")

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------

    def _posts_by_source_path(self):
        """Map reconstructed markdown paths ('data' + url + '.md') to post dicts."""
        return {
            'data' + post['url'] + '.md': post
            for post in self._data.get('blog_posts', [])
        }

    @staticmethod
    def _newest_first(articles):
        """Sort *articles* in place by 'date', newest first; undated ones sort as 1900-01-01."""
        articles.sort(
            key=lambda a: a['date'] if a['date'] else datetime(1900, 1, 1),
            reverse=True,
        )

    def _articles_with(self, section, field, build_field):
        """Assemble the per-article listing shared by sidenotes, outlines and quotes.

        Args:
            section: Key in ``self._data`` holding ``{'articles': {file_path: [items]},
                'total_count': ...}``.
            field: Name of the key that carries the section payload in each article.
            build_field: Callable ``(post, items) -> value`` producing that payload.

        Returns:
            ``{'articles': [...], 'total_count': ...}``. Files with no items, or
            with no matching blog post, are left out of ``articles``.
        """
        items_by_file = self._data[section]['articles']
        posts = self._posts_by_source_path()

        articles = []
        for file_path, items in items_by_file.items():
            if not items:
                continue
            # Reuse the pre-computed post metadata instead of re-reading files.
            post = posts.get(file_path)
            if not post:
                continue
            articles.append({
                'title': post['title'],
                'url': post['url'],
                'date': post.get('pub_date'),
                'category': post['category'].replace('-', ' ').title(),
                field: build_field(post, items),
                'unique_icon': post.get('unique_icon'),
            })

        self._newest_first(articles)
        return {
            'articles': articles,
            'total_count': self._data[section]['total_count'],
        }

    @staticmethod
    def _heading_entry(article_url, outline):
        """Build one heading dict (level, text, id, anchor_url) for an outline item."""
        # Always derive a slug from the text so a link target exists...
        heading_id = re.sub(r'[^\w\s-]', '', outline['text'].lower())
        heading_id = re.sub(r'[-\s]+', '-', heading_id).strip('-')

        # ...but prefer the id attribute of the rendered HTML when there is one.
        if outline.get('html'):
            id_match = re.search(r'id="([^"]*)"', outline['html'])
            if id_match and id_match.group(1):
                heading_id = id_match.group(1)

        return {
            'level': int(outline['level']),
            'text': outline['text'],
            'id': heading_id,
            'anchor_url': f"{article_url}#{heading_id}",
        }

    @staticmethod
    def _title_from_markdown(path):
        """Best-effort title lookup in the first 10 lines of a markdown file.

        Accepts a '# Heading' line or a 'title:' front-matter line, whichever
        comes first. Returns 'Unknown' when the file is missing, unreadable,
        or has neither.
        """
        try:
            if os.path.exists(path):
                with open(path, 'r', encoding='utf-8') as f:
                    # Only the head of the file is inspected, so only read that much.
                    for _, line in zip(range(10), f):
                        line = line.strip()
                        if line.startswith('# '):
                            return line[2:].strip()
                        if line.startswith('title:'):  # YAML frontmatter
                            return line[6:].strip().strip('"\'')
        except Exception:
            # Deliberate best effort: a bad source file must not break the page.
            pass
        return 'Unknown'

    # ------------------------------------------------------------------
    # Public getters
    # ------------------------------------------------------------------

    def get_sidenotes(self):
        """Get all sidenotes with metadata."""
        if not self._data:
            return {'articles': [], 'total_count': 0}

        return self._articles_with(
            'sidenotes',
            'sidenotes',
            lambda post, notes: [{'text': n['text'], 'id': n.get('id')} for n in notes],
        )

    def get_outlines(self):
        """Get all outlines with metadata."""
        if not self._data:
            return {'articles': [], 'total_count': 0}

        return self._articles_with(
            'outlines',
            'headings',
            lambda post, outlines: [self._heading_entry(post['url'], o) for o in outlines],
        )

    def get_quotes(self):
        """Get all quotes with metadata."""
        if not self._data:
            return {'articles': [], 'total_count': 0}

        return self._articles_with(
            'quotes',
            'quotes',
            lambda post, quotes: [q['text'] for q in quotes],
        )

    def get_connections(self):
        """Get all connections with metadata in template-expected format.

        Only files present in ``outgoing_refs`` are considered; each resulting
        article carries its outgoing links plus any incoming links recorded
        for its URL.
        """
        if not self._data:
            return {'articles': [], 'total_count': 0, 'total_outgoing': 0, 'total_incoming': 0}

        connections_cache = self._data['connections']
        outgoing_refs = connections_cache.get('outgoing_refs', {})
        incoming_refs = connections_cache.get('incoming_refs', {})

        print(f"DEBUG: get_connections - outgoing_refs has {len(outgoing_refs)} files")
        print(f"DEBUG: get_connections - incoming_refs has {len(incoming_refs)} refs")

        # URL -> post, and source file -> URL, both built from blog_posts.
        url_metadata = {}
        file_to_url = {}
        for post in self._data.get('blog_posts', []):
            url_metadata[post['url']] = post
            # Posts may carry their source path under either key.
            source_path = post.get('file_path') or post.get('path')
            if source_path:
                file_to_url[source_path] = post['url']

        articles = []

        print(f"DEBUG: file_to_url mapping has {len(file_to_url)} entries")

        for file_path, outgoing_list in outgoing_refs.items():
            article_url = file_to_url.get(file_path)
            if not article_url:
                print(f"DEBUG: No URL found for file_path: {file_path}")
                continue

            metadata = url_metadata.get(article_url)
            if not metadata:
                print(f"DEBUG: No metadata found for article_url: {article_url}")
                continue

            processed_outgoing = [
                {'target_url': conn['url'], 'link_text': conn['text']}
                for conn in outgoing_list
            ]

            processed_incoming = []
            for conn in incoming_refs.get(article_url, []):
                source_url = file_to_url.get(conn['source_file'])
                source_metadata = url_metadata.get(source_url) if source_url else None

                if source_metadata:
                    source_title = source_metadata['title']
                else:
                    # No known post for the source: fall back to the file itself.
                    source_title = self._title_from_markdown(conn['source_file'])

                processed_incoming.append({
                    'source_url': source_url or conn['source_file'],
                    'source_title': source_title,
                    'link_text': conn['text'],
                })

            # Only include articles that have connections
            if processed_outgoing or processed_incoming:
                articles.append({
                    'title': metadata['title'],
                    'url': article_url,
                    'date': metadata.get('pub_date'),
                    'category': metadata['category'].replace('-', ' ').title(),
                    'connections': processed_outgoing,  # For backward compatibility
                    'outgoing_connections': processed_outgoing,
                    'incoming_connections': processed_incoming,
                    'unique_icon': metadata.get('unique_icon'),
                })

        self._newest_first(articles)

        total_outgoing = sum(len(article['outgoing_connections']) for article in articles)
        total_incoming = sum(len(article['incoming_connections']) for article in articles)

        return {
            'articles': articles,
            'total_count': total_outgoing + total_incoming,
            'total_outgoing': total_outgoing,
            'total_incoming': total_incoming,
        }

    def get_terms(self):
        """Get all terms with metadata."""
        if not self._data:
            return {'terms': [], 'total_terms': 0, 'total_occurrences': 0}

        terms_data = self._data['terms']
        return {
            'terms': terms_data['terms'],
            'total_terms': len(terms_data['terms']),
            'total_occurrences': terms_data['total_occurrences'],
        }

    def get_blog_posts(self):
        """Get all blog posts from unified cache."""
        if not self._data:
            return []

        return self._data.get('blog_posts', [])


# Global metadata cache instance
metadata_cache = MetadataCache()
|
|
|
|
def initialize_unified_cache():
    """Initialize unified cache at startup.

    Delegates to ``metadata_cache.initialize()``. The module-level ``_*_cache``
    dicts are not touched here, so no ``global`` declarations are needed.
    """
    metadata_cache.initialize()
|
|
|
|
def _peek_post_text(item_path, chars=None, lines=10):
    """Return the head of a text file, or '' when it cannot be read or decoded.

    With *chars* set, the first *chars* characters are returned; otherwise the
    first *lines* lines are returned joined together.
    """
    try:
        with open(item_path, 'r', encoding='utf-8') as f:
            if chars is not None:
                return f.read(chars)
            return ''.join(line for _, line in zip(range(lines), f))
    except (OSError, ValueError):  # UnicodeDecodeError is a ValueError
        return ''


def extract_intelligent_date(item_path, content_data=None):
    """Extract date intelligently, prioritizing filename patterns as requested.

    Sources are tried in this order and the first one that yields a valid
    date wins:

    1. ``YYYY-MM-DD`` anywhere in the file name.
    2. ``YYYY-MM`` at the start of the file stem; the day comes from a
       ``YYYY-MM-DD`` date *of that same month* in the first 1000 characters
       of the file, otherwise the 1st.
    3. ``YYYY`` at the start of the file stem; the month comes from a
       ``*Month YYYY*`` marker in the first 10 lines, otherwise January.
    4. ``content_data['metadata']['date']`` (string or list) as ``YYYY-MM-DD``.
    5. A ``*Month YYYY*`` marker in the first 10 lines of the file.

    Args:
        item_path: ``pathlib.Path`` of the markdown file.
        content_data: Optional dict with a ``'metadata'`` mapping.

    Returns:
        A naive ``datetime``, or ``None`` when no source produced a date.
    """
    # 1. PRIORITY: full YYYY-MM-DD anywhere in the filename.
    full_match = re.search(r'(\d{4}-\d{2}-\d{2})', item_path.name)
    if full_match:
        try:
            return datetime.strptime(full_match.group(1), '%Y-%m-%d')
        except ValueError:
            pass  # digits that are not a real date; try the weaker sources

    # 2. YYYY-MM at the start of the filename.
    month_match = re.match(r'(\d{4}-\d{2})', item_path.stem)
    if month_match:
        year_month = month_match.group(1)
        preview = _peek_post_text(item_path, chars=1000)
        # Only trust a day that belongs to this very month; a date from another
        # month mentioned in the text says nothing about the publication day.
        day_match = re.search(re.escape(year_month) + r'-(\d{2})', preview)
        candidate_days = [int(day_match.group(1))] if day_match else []
        candidate_days.append(1)  # an impossible day falls back to the 1st
        for day in candidate_days:
            try:
                return datetime.strptime(f'{year_month}-{day:02d}', '%Y-%m-%d')
            except ValueError:
                continue

    # 3. Just a year at the start of the filename.
    year_match = re.match(r'(\d{4})', item_path.stem)
    if year_match:
        year = int(year_match.group(1))
        month = 1
        head = _peek_post_text(item_path)
        # Look for "*Month YYYY*" naming the same year.
        named_month = re.search(r'\*([A-Za-z]+)\s+' + str(year) + r'\*', head)
        if named_month:
            try:
                month = datetime.strptime(named_month.group(1), '%B').month
            except ValueError:
                pass  # not a month name; keep January
        try:
            return datetime(year, month, 1)
        except ValueError:
            pass  # e.g. year 0000

    # 4. YAML front matter date (lower priority than the filename).
    if content_data and content_data['metadata'].get('date'):
        raw_date = content_data['metadata']['date']
        try:
            if isinstance(raw_date, list):
                return datetime.strptime(raw_date[0], '%Y-%m-%d')
            return datetime.strptime(str(raw_date), '%Y-%m-%d')
        except (ValueError, TypeError, IndexError):
            pass

    # 5. "*Month YYYY*" marker near the top of the content.
    head = _peek_post_text(item_path)
    marker = re.search(r'\*([A-Za-z]+\s+\d{4})\*', head)
    if marker:
        try:
            # Month-only dates are pinned to the first day of the month.
            return datetime.strptime(marker.group(1), '%B %Y').replace(day=1)
        except ValueError:
            pass

    # 6. Nothing usable. File timestamps are deliberately not consulted
    #    (removed due to deployment issues).
    return None
|
|
|
|
|
|
def _blog_post_teaser(raw_markdown):
    """Return a short plain-text teaser from raw markdown, or '' if no line qualifies.

    Front matter is dropped, then the first line that is not a header, image,
    image reference, ``*date*`` line or horizontal rule, and that still has
    more than 20 characters after markdown markup is stripped, is returned
    (cut to 150 characters plus '...' when longer).
    """
    # Skip front matter if present
    if raw_markdown.startswith('---'):
        parts = raw_markdown.split('---', 2)
        if len(parts) >= 3:
            raw_markdown = parts[2].strip()

    skip_patterns = (
        r'^\s*#{1,6}\s',                  # headers
        r'^\s*!\[[^\]]*\]\([^)]*\)\s*$',  # images
        r'^\s*\[Image #\d+\]\s*$',        # image references
        r'^\s*\*[^*]*\*\s*$',             # date/metadata lines
        r'^\s*[-*_]{3,}\s*$',             # horizontal rules
    )

    for raw_line in raw_markdown.split('\n'):
        line = raw_line.strip()
        if not line or any(re.match(pattern, line) for pattern in skip_patterns):
            continue

        # Strip inline markdown so the teaser reads as plain text.
        text = re.sub(r'!\[[^\]]*\]\([^)]*\)', '', line)  # Remove images
        text = re.sub(r'\[([^\]]*)\]\([^)]*\)', r'\1', text)  # Convert links to text
        text = re.sub(r'[*_]{1,3}([^*_]+)[*_]{1,3}', r'\1', text)  # Remove bold/italic
        text = re.sub(r'`([^`]+)`', r'\1', text)  # Remove code formatting
        text = re.sub(r'#{1,6}\s*', '', text)  # Remove header markers
        text = text.strip()

        if len(text) > 20:  # Must have substantial content
            return text[:150] + '...' if len(text) > 150 else text

    return ""


def _blog_post_record(item, category):
    """Build the post dict for one markdown file, or None when it has no date.

    Args:
        item: ``pathlib.Path`` of a markdown file below ``DATA_DIR``.
        category: Category label inherited from the directory scan; when empty
            the parent directory name is used.
    """
    content_data = render_markdown_file(item)

    # Posts without any determinable date are left out of the listing.
    pub_date = extract_intelligent_date(item, content_data)
    if pub_date is None:
        return None

    # Clean URL: path relative to DATA_DIR, no '.md', always '/'-separated.
    clean_url = '/' + item.relative_to(DATA_DIR).with_suffix('').as_posix()

    # Prefer a teaser from the raw markdown (before HTML conversion).
    try:
        with open(item, 'r', encoding='utf-8') as f:
            description = _blog_post_teaser(f.read())
    except Exception:
        # Fallback to the rendered HTML if raw reading fails
        description = ""
        content_text = re.sub(r'<[^>]+>', '', content_data['content'])
        if content_text.strip():
            description = content_text.strip()[:150] + '...'

    html_content = content_data['content']
    return {
        'title': content_data['title'],
        'url': clean_url,
        'description': description,
        'pub_date': pub_date,
        'category': category or item.parent.name.replace('-', ' ').title(),
        # Only the first 1000 characters of the rendered HTML are kept.
        'content': html_content[:1000] + '...' if len(html_content) > 1000 else html_content,
    }


def _collect_all_blog_posts_cached():
    """Internal cached function to collect all blog posts with TTL.

    Returns the cached tuple while it is younger than ``CACHE_TTL`` and was
    stored after ``_force_cache_clear``. Otherwise the blog directories are
    rescanned, the posts are sorted newest first, and the result is stored in
    ``_blog_posts_cache``.

    Returns:
        A tuple of post dicts with keys 'title', 'url', 'description',
        'pub_date', 'category' and 'content'.
    """
    current_time = time.time()

    cached = _blog_posts_cache['data']
    stored_at = _blog_posts_cache['timestamp']
    if (cached is not None and
            current_time - stored_at < CACHE_TTL and
            stored_at > _force_cache_clear):
        return cached

    # Cache miss or expired - rebuild
    posts = []

    blog_dirs = [
        DATA_DIR / 'essays',
        DATA_DIR / 'artificial-intelligence',  # Root AI posts; subdirs are scanned too
        DATA_DIR / 'software',
        DATA_DIR / 'poetry',
        DATA_DIR / 'talks',
        DATA_DIR / 'themes',
    ]

    def scan_for_posts(path, category=""):
        """Recursively add every dated markdown post below *path* to ``posts``."""
        if not path.exists() or not path.is_dir():
            return

        for item in sorted(path.iterdir(), reverse=True):  # Most recent first
            if item.name.startswith('.') or item.name.lower() == 'index.md':
                continue

            if item.is_file() and item.suffix == '.md':
                try:
                    record = _blog_post_record(item, category)
                except Exception:
                    # Best effort: one unreadable post must not sink the listing.
                    continue
                if record is not None:
                    posts.append(record)
            elif item.is_dir():
                scan_for_posts(item, category or item.name.replace('-', ' ').title())

    for blog_dir in blog_dirs:
        if blog_dir.exists():
            category = blog_dir.name.replace('-', ' ').title()
            if 'artificial-intelligence' in str(blog_dir):
                category = 'AI & Consciousness'
            scan_for_posts(blog_dir, category)

    # Sort by publication date (most recent first)
    posts.sort(key=lambda x: x['pub_date'], reverse=True)

    # Store an immutable snapshot so callers cannot reorder the cached data.
    result = tuple(posts)
    _blog_posts_cache['data'] = result
    _blog_posts_cache['timestamp'] = time.time()

    return result
|
|
|
|
|
|
def collect_all_blog_posts():
    """Return all blog posts as a new list built from the cached tuple."""
    cached_posts = _collect_all_blog_posts_cached()
    return [*cached_posts]
|
|
|
|
|
|
def preload_blog_posts():
    """Warm the blog posts cache at startup and print how long it took."""
    print("Preloading blog posts cache...")
    began = time.time()
    warmed = _collect_all_blog_posts_cached()
    elapsed = time.time() - began
    post_count = len(warmed)
    print(f"Loaded {post_count} posts in {elapsed:.2f}s")
|
|
|
|
|
|
def preload_sidenotes():
    """Warm the sidenotes cache at startup and print a one-line summary."""
    print("Preloading sidenotes cache...")
    began = time.time()
    warmed = _extract_all_sidenotes_cached()
    elapsed = time.time() - began
    total = warmed['total_count']
    article_count = len(warmed['articles'])
    print(f"Extracted {total} sidenotes from {article_count} articles in {elapsed:.2f}s")
|
|
|
|
|
|
def preload_outlines():
    """Warm the outlines cache at startup and print a one-line summary."""
    print("Preloading outlines cache...")
    began = time.time()
    warmed = _extract_all_outlines_cached()
    elapsed = time.time() - began
    total = warmed['total_count']
    article_count = len(warmed['articles'])
    print(f"Extracted {total} headings from {article_count} articles in {elapsed:.2f}s")
|
|
|
|
|
|
def preload_quotes():
    """Warm the quotes cache at startup and print a one-line summary."""
    print("Preloading quotes cache...")
    began = time.time()
    warmed = _extract_all_quotes_cached()
    elapsed = time.time() - began
    total = warmed['total_count']
    article_count = len(warmed['articles'])
    print(f"Extracted {total} quotes from {article_count} articles in {elapsed:.2f}s")
|
|
|
|
|
|
def preload_connections():
    """Warm the connections cache at startup and print a one-line summary."""
    print("Preloading connections cache...")
    began = time.time()
    warmed = _extract_all_connections_cached()
    elapsed = time.time() - began
    total = warmed['total_count']
    print(f"Extracted {total} cross-references in {elapsed:.2f}s")
|
|
|
|
|
|
def _extract_all_external_links_cached():
    """Extract all external links from articles with 10-hour TTL cache.

    Returns the cached result while it is younger than ``CACHE_TTL`` and was
    stored after ``_force_cache_clear``; otherwise rebuilds it from
    ``_collect_all_blog_posts_cached()``.

    Returns:
        Dict with:
          'articles': list of ``{'title', 'url', 'date', 'category',
              'external_links'}``, newest first, only for posts that have at
              least one external link;
          'total_count': number of external links found;
          'domain_stats': list of ``(domain, count)`` pairs, most frequent first.
    """
    current_time = time.time()

    # Check if cache is still valid (10 hour TTL)
    if (_external_links_cache['data'] is not None and
            current_time - _external_links_cache['timestamp'] < CACHE_TTL and
            _external_links_cache['timestamp'] > _force_cache_clear):
        return _external_links_cache['data']

    posts = _collect_all_blog_posts_cached()
    articles_with_links = []
    total_count = 0
    domain_counts = defaultdict(int)

    # Anchors whose href is an absolute http/https URL; compiled once, not per post.
    anchor_re = re.compile(r'<a[^>]*href="(https?://[^"]*)"[^>]*>(.*?)</a>',
                           re.IGNORECASE | re.DOTALL)
    tag_re = re.compile(r'<[^>]+>')
    domain_re = re.compile(r'https?://(?:www\.)?([^/]+)')

    for post in posts:
        external_links = []

        # NOTE(review): post['content'] is the truncated HTML stored by
        # _collect_all_blog_posts_cached (first 1000 characters), so links
        # further down an article are not seen here.
        for url, link_text in anchor_re.findall(post['content']):
            # Skip internal links (substring check against the site's domain)
            if 'kennethreitz.org' in url:
                continue

            # Visible link text without markup; fall back to the URL itself.
            clean_text = tag_re.sub('', link_text).strip()
            if not clean_text:
                clean_text = url

            # Extract domain for stats
            domain = domain_re.match(url)
            if domain:
                domain_counts[domain.group(1)] += 1

            external_links.append({
                'url': url,
                'link_text': clean_text[:100],  # Truncate very long link text
                'domain': domain.group(1) if domain else 'unknown',
            })

        if external_links:
            articles_with_links.append({
                'title': post['title'],
                'url': post['url'],
                # Posts store their date under 'pub_date'; reading 'date' here
                # left every article undated.
                'date': post.get('pub_date'),
                'category': post.get('category', 'Unknown'),
                'external_links': external_links,
            })
            total_count += len(external_links)

    # Sort articles by publication date (most recent first)
    articles_with_links.sort(key=lambda x: x['date'] or datetime.min, reverse=True)

    # Sort domains by frequency
    top_domains = sorted(domain_counts.items(), key=lambda x: x[1], reverse=True)

    result = {
        'articles': articles_with_links,
        'total_count': total_count,
        'domain_stats': top_domains,
    }

    # Cache the result
    _external_links_cache['data'] = result
    _external_links_cache['timestamp'] = current_time

    return result
|
|
|
|
|
|
def preload_external_links():
    """Warm the external links cache at startup and print a one-line summary."""
    print("Preloading external links cache...")
    began = time.time()
    warmed = _extract_all_external_links_cached()
    elapsed = time.time() - began
    total = warmed['total_count']
    article_count = len(warmed['articles'])
    print(f"Extracted {total} external links from {article_count} articles in {elapsed:.2f}s")
|
|
|
|
|
|
def _extract_all_terms_cached():
    """Return pre-loaded terms cache data (pure RAM, no TTL).

    On the first call the index is built from ``_collect_all_blog_posts_cached()``
    and stored in ``_terms_cache['data']``; later calls return that object.

    Returns:
        Dict with:
          'terms': ``{term: {'articles': [{'title', 'url', 'count'}, ...],
              'total_count', 'article_count'}}`` ordered case-insensitively
              by term;
          'total_terms': number of terms kept;
          'total_occurrences': sum of every term's 'total_count'.
    """
    # Return pre-loaded cache data if available
    if _terms_cache['data'] is not None:
        return _terms_cache['data']

    posts = _collect_all_blog_posts_cached()
    term_occurrences = defaultdict(list)  # term -> [{'title', 'url', 'count'}, ...]

    # Common stop words to filter out
    stop_words = {
        'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
        'by', 'from', 'as', 'an', 'a', 'is', 'was', 'are', 'were', 'be', 'been',
        'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should',
        'may', 'might', 'can', 'this', 'that', 'these', 'those', 'i', 'you', 'he',
        'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them', 'my', 'your',
        'his', 'its', 'our', 'their', 'not', 'all', 'some', 'any', 'each', 'every',
        'one', 'two', 'if', 'then', 'so', 'when', 'where', 'how', 'why', 'what',
        'about', 'into', 'through', 'during', 'before', 'after', 'above', 'below',
        'up', 'down', 'out', 'off', 'over', 'under', 'again', 'further', 'then',
        'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both',
        'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'only',
        'own', 'same', 'than', 'too', 'very', 'just', 'now', 'also', 'often', 'really',
        'much', 'many', 'way', 'well', 'even', 'still', 'get', 'go', 'come', 'make',
        'take', 'know', 'see', 'think', 'say', 'work', 'feel', 'look', 'seem', 'want',
        'use', 'find', 'give', 'tell', 'ask', 'try', 'help', 'need', 'become', 'turn',
        'start', 'show', 'hear', 'play', 'run', 'move', 'live', 'believe', 'hold',
        'bring', 'happen', 'write', 'provide', 'sit', 'stand', 'lose', 'pay', 'meet'
    }

    # Technical terms that should always be included
    important_terms = {
        'API', 'HTTP', 'Python', 'JavaScript', 'AI', 'ML', 'consciousness', 'algorithm',
        'Requests', 'Flask', 'Django', 'GitHub', 'software', 'programming', 'technology',
        'artificial intelligence', 'machine learning', 'open source', 'philosophy'
    }

    # Multi-word technical phrases, with the lowercase form computed once.
    tech_phrases = [
        'artificial intelligence', 'machine learning', 'open source', 'user experience',
        'mental health', 'spiritual practice', 'human consciousness', 'digital mind',
        'for humans', 'API design', 'software development', 'programming language'
    ]
    lowered_phrases = [(phrase, phrase.lower()) for phrase in tech_phrases]

    ignored_acronyms = ('THE', 'AND', 'FOR', 'BUT', 'NOT')

    # Patterns are loop-invariant, so compile them once up front.
    tag_re = re.compile(r'<[^>]+>')
    whitespace_re = re.compile(r'\s+')
    capitalized_re = re.compile(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b')
    quoted_re = re.compile(r'["\']([^"\']{3,30})["\']')
    acronym_re = re.compile(r'\b[A-Z]{2,8}\b')
    long_word_re = re.compile(r'\b\w{4,}\b')

    for post in posts:
        # Clean content - remove HTML tags and collapse whitespace
        clean_content = whitespace_re.sub(' ', tag_re.sub(' ', post['content']))
        lowered_content = clean_content.lower()

        # Per-post tally; insertion order follows the strategy order below.
        terms_in_post = defaultdict(int)

        # Strategy 1: Capitalized words/phrases (likely proper nouns, concepts)
        for term in capitalized_re.findall(clean_content):
            if len(term) > 2 and term.lower() not in stop_words:
                terms_in_post[term] += 1

        # Strategy 2: Technical terms in quotes or emphasized
        for term in quoted_re.findall(clean_content):
            if term.lower() not in stop_words and len(term.split()) <= 3:
                terms_in_post[term] += 1

        # Strategy 3: Acronyms and technical terms
        for term in acronym_re.findall(clean_content):
            if term not in ignored_acronyms:
                terms_in_post[term] += 2  # Weight acronyms higher

        # Strategy 4: Important technical words
        # NOTE(review): the words are lowercased and at least four characters
        # long, so the mixed-case and short entries of important_terms
        # ('Python', 'GitHub', 'API', ...) and the multi-word ones can never
        # match here. Left as is because matching them would change the
        # published counts; confirm the intended behavior.
        for word in long_word_re.findall(lowered_content):
            if word in important_terms:
                terms_in_post[word] += 1

        # Strategy 5: Multi-word technical phrases (flat +2 when present at all)
        for phrase, lowered_phrase in lowered_phrases:
            if lowered_phrase in lowered_content:
                terms_in_post[phrase] += 2

        # Add this post's terms to the global index
        for term, count in terms_in_post.items():
            term_occurrences[term].append({
                'title': post['title'],
                'url': post['url'],
                'count': count
            })

    # Filter and organize terms
    final_terms = {}
    for term, occurrences in term_occurrences.items():
        # Keep terms that appear in multiple articles OR frequently in one
        total_occurrences = sum(occ['count'] for occ in occurrences)
        if len(occurrences) >= 2 or total_occurrences >= 3:
            # Articles with the most occurrences of the term come first
            occurrences.sort(key=lambda x: x['count'], reverse=True)
            final_terms[term] = {
                'articles': occurrences,
                'total_count': total_occurrences,
                'article_count': len(occurrences)
            }

    # Sort terms alphabetically
    sorted_terms = dict(sorted(final_terms.items(), key=lambda x: x[0].lower()))

    result = {
        'terms': sorted_terms,
        'total_terms': len(sorted_terms),
        'total_occurrences': sum(term_data['total_count'] for term_data in sorted_terms.values())
    }

    # Cache the result
    _terms_cache['data'] = result

    return result
|
|
|
|
|
|
def preload_terms():
    """Warm the terms cache at startup and print a one-line summary."""
    print("Preloading terms cache...")
    began = time.time()
    warmed = _extract_all_terms_cached()
    elapsed = time.time() - began
    term_total = warmed['total_terms']
    occurrence_total = warmed['total_occurrences']
    print(f"Extracted {term_total} terms with {occurrence_total} total occurrences in {elapsed:.2f}s")
|
|
|
|
|
|
def find_related_posts(current_post_path, limit=3):
    """Find related posts based on category and content similarity.

    Scoring per candidate post:
      * +10 when its category equals the current post's;
      * +2 for every word shared between the two titles;
      * +0.5 for every word shared between the two descriptions;
      * up to +1 when the publication dates are less than a year apart,
        shrinking linearly with the distance in days.

    Args:
        current_post_path: Path of the current post relative to the data
            directory, with or without the '.md' extension.
        limit: Maximum number of posts to return.

    Returns:
        The highest-scoring posts (score > 0), best first; ``[]`` when the
        current post is not among the collected blog posts.
    """
    posts = collect_all_blog_posts()
    if current_post_path.endswith('.md'):
        current_post_url = '/' + current_post_path[:-3]
    else:
        current_post_url = '/' + current_post_path

    current_post = next((post for post in posts if post['url'] == current_post_url), None)
    if not current_post:
        return []

    def word_set(text):
        """Lowercased whitespace-separated words of *text*; empty set for no text."""
        return set(text.lower().split()) if text else set()

    # The current post's word sets do not change per candidate; build them once.
    current_title_words = set(current_post['title'].lower().split())
    current_desc_words = word_set(current_post['description'])

    related_posts = []
    for post in posts:
        if post['url'] == current_post_url:
            continue  # Skip current post

        score = 0

        # Category match gets high score
        if post['category'] == current_post['category']:
            score += 10

        # Common words in titles (simple text similarity)
        post_title_words = set(post['title'].lower().split())
        score += len(current_title_words & post_title_words) * 2

        # Common words in descriptions
        score += len(current_desc_words & word_set(post['description'])) * 0.5

        # Posts published within a year of each other get a small boost
        days_diff = abs((current_post['pub_date'] - post['pub_date']).days)
        if days_diff < 365:
            score += max(0, (365 - days_diff) / 365)

        if score > 0:
            related_posts.append((post, score))

    # Sort by score and return top N
    related_posts.sort(key=lambda x: x[1], reverse=True)
    return [post for post, score in related_posts[:limit]]
|
|
|
|
|
|
def find_adjacent_posts(current_post_path):
    """Return the posts listed just before and just after the given post.

    The list from ``collect_all_blog_posts()`` is used as is, so "previous"
    is the entry one position earlier and "next" the entry one position
    later. Returns ``(None, None)`` when the post is not in the list.
    """
    posts = collect_all_blog_posts()

    if current_post_path.endswith('.md'):
        target_url = '/' + current_post_path[:-3]
    else:
        target_url = '/' + current_post_path

    # Position of the current post, if it is listed at all.
    position = next(
        (index for index, candidate in enumerate(posts) if candidate['url'] == target_url),
        None,
    )
    if position is None:
        return None, None

    # Previous (newer) and next (older) neighbours, None at either end.
    prev_post = None
    if position > 0:
        prev_post = posts[position - 1]

    next_post = None
    if position < len(posts) - 1:
        next_post = posts[position + 1]

    return prev_post, next_post
|
|
|
|
|
|
def generate_sitemap_data():
    """Generate sitemap data by recursively scanning the data directory.

    Returns:
        A list of dicts. The three static pages come first with keys 'url',
        'title' and 'type'. They are followed by one entry per directory
        (type 'directory') and per markdown file (type 'article') found
        below ``DATA_DIR``; those entries also carry 'modified', the
        filesystem modification time as a ``datetime``.
    """
    sitemap_items = []

    def scan_directory(path, url_path=""):
        """Append entries for everything below *path*; *url_path* is its URL prefix."""
        if not path.exists() or not path.is_dir():
            return

        for item in sorted(path.iterdir()):
            if item.name.startswith('.'):
                continue

            item_url_path = f"{url_path}/{item.name}" if url_path else item.name

            if item.is_dir():
                # Add directory to sitemap
                sitemap_items.append({
                    'url': f"/{item_url_path}",
                    'title': item.name.replace('-', ' ').replace('_', ' ').title(),
                    'type': 'directory',
                    'modified': datetime.fromtimestamp(item.stat().st_mtime)
                })
                # Recursively scan subdirectories
                scan_directory(item, item_url_path)
            elif item.suffix == '.md':
                # Remove .md extension for clean URLs
                clean_url_path = item_url_path[:-3]

                # Title from the rendered file; fall back to a prettified file name.
                title = item.stem.replace('-', ' ').replace('_', ' ').title()
                try:
                    title = render_markdown_file(item)['title']
                except Exception:
                    # Best effort: keep the fallback title if rendering fails.
                    # (Not a bare except, so interrupts and exits still propagate.)
                    pass

                sitemap_items.append({
                    'url': f"/{clean_url_path}",
                    'title': title,
                    'type': 'article',
                    'modified': datetime.fromtimestamp(item.stat().st_mtime)
                })

    # Start scanning from data directory
    scan_directory(DATA_DIR)

    # Add static pages
    static_pages = [
        {'url': '/', 'title': 'Kenneth Reitz - Digital Mind Map', 'type': 'homepage'},
        {'url': '/directory', 'title': 'File Explorer', 'type': 'directory'},
        {'url': '/sitemap', 'title': 'Site Map', 'type': 'sitemap'}
    ]

    return static_pages + sitemap_items
|
|
|
|
@app.route('/sitemap')
def sitemap():
    """Render the human-readable site map page with entries grouped by type."""
    entries = generate_sitemap_data()

    # Fixed set of groups; entries whose type is not one of these are not
    # placed in any group (they still count towards total_items).
    groups = {kind: [] for kind in ('homepage', 'directory', 'article', 'sitemap')}
    for entry in entries:
        bucket = groups.get(entry.get('type', 'article'))
        if bucket is not None:
            bucket.append(entry)

    return render_template(
        'sitemap.html',
        title='Site Map',
        sitemap_data=groups,
        total_items=len(entries),
        breadcrumbs=[],
        current_year=datetime.now().year,
        current_page='Site Map',
    )
|
|
|
|
@app.route('/sitemap.xml')
def sitemap_xml():
    """Serve the sitemap as XML for search engines.

    Emits one <url> element per entry from generate_sitemap_data(), with a
    <lastmod> date only for entries that carry a 'modified' timestamp.
    """
    lines = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
    ]

    for entry in generate_sitemap_data():
        lines.append(' <url>')
        lines.append(f' <loc>https://kennethreitz.org{escape(entry["url"])}</loc>')
        if 'modified' in entry:
            lines.append(f' <lastmod>{entry["modified"].strftime("%Y-%m-%d")}</lastmod>')
        lines.append(' </url>')

    lines.append('</urlset>')

    # Joining with newlines leaves no trailing newline after </urlset>.
    return Response('\n'.join(lines), mimetype='application/xml')
|
|
|
|
|
|
def _cdata_safe(text):
    """Return *text* made safe for embedding inside a single CDATA section."""
    # A literal "]]>" would end the CDATA section early; split it across two
    # adjacent sections so XML parsers still see the original characters.
    return str(text).replace(']]>', ']]]]><![CDATA[>')


@app.route('/feed.xml')
@app.route('/rss.xml')
def rss_feed():
    """Generate RSS feed with full article content.

    Served at both /feed.xml and /rss.xml. Every post returned by
    collect_all_blog_posts() becomes one <item>. The full article body is
    re-rendered from the post's Markdown file under DATA_DIR when that file
    exists; otherwise the post's stored 'content' (or 'description') is used.

    Returns:
        Response: the RSS 2.0 document with mimetype 'application/rss+xml'.
    """
    # Local import keeps this block self-contained; formatdate() produces an
    # RFC 822 date that really is in GMT and does not depend on the locale.
    from email.utils import formatdate

    posts = collect_all_blog_posts()  # Use all posts like the archive page

    # Collect fragments and join once instead of repeated string +=.
    chunks = [
        '<?xml version="1.0" encoding="UTF-8"?>\n',
        '<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/">\n',
        ' <channel>\n',
        # The ampersand must be written as an entity to keep the XML well-formed.
        ' <title>Kenneth Reitz - Essays &amp; AI Writings</title>\n',
        ' <description>Complete archive with full articles - Essays, AI consciousness research, and philosophical explorations</description>\n',
        ' <link>https://kennethreitz.org</link>\n',
        ' <atom:link href="https://kennethreitz.org/feed.xml" rel="self" type="application/rss+xml" />\n',
        f' <lastBuildDate>{formatdate(usegmt=True)}</lastBuildDate>\n',
        ' <language>en-us</language>\n',
        ' <managingEditor>me@kennethreitz.org (Kenneth Reitz)</managingEditor>\n',
        ' <webMaster>me@kennethreitz.org (Kenneth Reitz)</webMaster>\n',
    ]

    for post in posts:
        # Get full content for this post by re-reading the file
        try:
            # Build the file path - post['url'] is like '/essays/2025-09-01-something'
            relative_path = post['url'][1:]  # Remove leading /
            file_path = DATA_DIR / (relative_path + '.md')

            if file_path.exists():
                full_content = render_markdown_file(file_path)['content']
            else:
                # Fallback to stored content (truncated)
                full_content = post.get('content', post['description'])
        except Exception as e:
            # Debug: use description with error info
            full_content = f"{post['description']} <!-- Error loading full content: {str(e)} -->"

        # URLs are escaped like every other text node so that characters
        # such as '&' cannot break the document.
        post_url = escape(post["url"])

        chunks.extend([
            ' <item>\n',
            f' <title>{escape(post["title"])}</title>\n',
            f' <link>https://kennethreitz.org{post_url}</link>\n',
            f' <description>{escape(post["description"])}</description>\n',
            f' <content:encoded><![CDATA[{_cdata_safe(full_content)}]]></content:encoded>\n',
            f' <category>{escape(post["category"])}</category>\n',
            f' <pubDate>{post["pub_date"].strftime("%a, %d %b %Y %H:%M:%S GMT")}</pubDate>\n',
            f' <guid>https://kennethreitz.org{post_url}</guid>\n',
            ' </item>\n',
        ])

    chunks.append(' </channel>\n')
    chunks.append('</rss>')

    return Response(''.join(chunks), mimetype='application/rss+xml')
|
|
|
|
|
|
|
|
# Preload caches in a background thread so startup is not blocked (works with both direct run and Gunicorn)
|
|
import concurrent.futures
|
|
import threading
|
|
|
|
def preload_all_caches():
    """Run each cache preloader one after another.

    The preloaders run sequentially to reduce memory usage. A failure in
    one is printed with its traceback and does not stop the ones after it.
    """
    import traceback

    print("Starting background cache preloading...")

    steps = (
        ("blog posts", preload_blog_posts),
        ("sidenotes", preload_sidenotes),
        ("outlines", preload_outlines),
        ("quotes", preload_quotes),
        ("connections", preload_connections),
        ("terms", preload_terms),
    )

    for label, loader in steps:
        try:
            loader()
        except Exception as exc:
            print(f"Error preloading {label}: {exc}")
            traceback.print_exc()

    print("Background cache preloading completed!")
|
|
|
|
def start_background_preload():
    """Launch preload_all_caches on a daemon thread and return immediately."""
    threading.Thread(target=preload_all_caches, daemon=True).start()
    print("Cache preloading started in background. App ready to serve requests!")
|
|
|
|
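# Background preloading is meant to run in only one process, not in every Gunicorn worker.
# A lock file was intended to elect that process; should_preload_caches() currently
# returns False unconditionally, so no lock is taken and no runtime preload starts.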
|
|
import os
|
|
import fcntl
|
|
import atexit
|
|
|
|
# Module-level handle for the preload lock file; None means no lock file is open.
cache_lock_file = None
|
|
|
|
def should_preload_caches():
    """Report whether this process should run the background cache preload.

    Runtime preloading is disabled because the unified cache is loaded at
    startup, so this always announces the skip and returns False. The
    lock-file election that used to follow the early return could never
    execute and has been removed.

    Returns:
        bool: always False.
    """
    # Skip preloading since we already initialized unified cache
    print("Skipping runtime preload - unified cache already loaded!")
    return False
|
|
|
|
# Initialize unified cache at startup (after all functions are defined).
# This is a module-level call, so it runs whenever this module is imported or executed.
initialize_unified_cache()

# Start background preloading only in one process (and only if needed)
if should_preload_caches():
    start_background_preload()

# Direct-run entry point: only reached when this file is executed as a script.
# NOTE(review): debug=True combined with host='0.0.0.0' makes the Flask/Werkzeug
# debugger reachable on all interfaces - confirm this path is only used for
# local development.
if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0', port=8000)
|