mirror of
https://github.com/kennethreitz/kennethreitz.org.git
synced 2026-06-05 22:50:17 +00:00
99 lines
3.1 KiB
Python
99 lines
3.1 KiB
Python
import os
|
|
import re
|
|
import xml.etree.ElementTree as ET
|
|
from datetime import datetime
|
|
import markdownify
|
|
import pathlib
|
|
import unicodedata
|
|
|
|
|
|
def slugify(value, allow_unicode=False):
    """
    Turn an arbitrary value into a lowercase, underscore-separated slug.

    Only word characters survive (hyphen and whitespace runs become a single
    underscore), so the result is safe to use as one filename component: path
    separators are always removed.

    Args:
        value: Anything convertible with ``str()``.
        allow_unicode: Keep non-ASCII word characters (NFKC-normalized) when
            true; otherwise reduce the text to its ASCII form.

    Returns:
        The slug, with leading/trailing hyphens and underscores trimmed.
        May be an empty string.
    """
    text = str(value)
    if not allow_unicode:
        # Decompose accented characters, then drop whatever has no ASCII form.
        decomposed = unicodedata.normalize("NFKD", text)
        text = decomposed.encode("ascii", "ignore").decode("ascii")
    else:
        text = unicodedata.normalize("NFKC", text)

    # Discard everything except word characters, whitespace and hyphens.
    cleaned = re.sub(r"[^\w\s-]", "", text.lower())
    # Collapse hyphen/whitespace runs into one underscore, then trim the ends.
    slug = re.sub(r"[-\s]+", "_", cleaned)
    return slug.strip("-_")
|
|
|
|
|
|
def extract_text_from_cdata(cdata):
    """
    Convert the HTML held in a CDATA section to Markdown.

    Args:
        cdata: HTML string, or None/empty when the source element had no text.

    Returns:
        Markdown text using ATX-style (``#``) headings, or an empty string
        for None/empty input.
    """
    # ElementTree reports the text of an empty element as None, and re.sub
    # raises TypeError on None, so treat missing content as an empty document.
    if not cdata:
        return ""

    # Collapse every whitespace run (newlines included) into a single space
    # before handing the HTML to the converter.
    html_content = re.sub(r"\s+", " ", cdata)
    return markdownify.markdownify(html_content, heading_style="ATX")
|
|
|
|
|
|
def save_post_as_markdown(title, content, pub_date, destination_dir):
    """
    Write one post to ``<destination_dir>/<year>/<slug>.md``.

    The file starts with a ``# <title>`` heading followed by ``content``, and
    its access and modification times are set to ``pub_date``.

    Args:
        title: Post title; used for the heading and, slugified, the filename.
        content: Markdown body of the post.
        pub_date: ``datetime`` of publication; supplies the year directory
            and the file timestamps.
        destination_dir: Output root; must support the ``/`` path operator
            (e.g. a ``pathlib.Path``).
    """
    # Posts are grouped into one directory per publication year.
    year_dir = destination_dir / str(pub_date.year)
    year_dir.mkdir(parents=True, exist_ok=True)

    # A title made only of punctuation slugifies to "", which would produce a
    # hidden ".md" file; fall back to a visible placeholder name.
    stem = slugify(title) or "untitled"
    file_path = year_dir / f"{stem}.md"

    # Force UTF-8 so non-ASCII posts are written identically on every
    # platform instead of depending on the locale's default encoding.
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(f"# {title}\n\n")
        file.write(content)

    # os.utime takes (access time, modification time); a creation time cannot
    # be set through this call.
    pub_timestamp = pub_date.timestamp()
    os.utime(file_path, (pub_timestamp, pub_timestamp))
|
|
|
|
|
|
def parse_xml_and_save_posts(xml_file, destination_dir):
    """
    Convert every ``<item>`` of an XML export into a Markdown file.

    Each item's ``content:encoded`` HTML is converted to Markdown and written
    under ``destination_dir`` in a per-year subdirectory.

    Args:
        xml_file: Path (or file object) accepted by ``ElementTree.parse``.
        destination_dir: Output root; created if it does not exist.

    Raises:
        AttributeError: An item has no ``pubDate`` element.
        TypeError: An item's ``pubDate`` element is empty.
        ValueError: A ``pubDate`` does not match the expected
            ``"%a, %d %b %Y %H:%M:%S %z"`` format.
    """
    root = ET.parse(xml_file).getroot()

    destination_dir = pathlib.Path(destination_dir)
    destination_dir.mkdir(parents=True, exist_ok=True)

    # Prefix-to-URI map so namespaced tags can be addressed as "prefix:tag".
    namespaces = {
        "content": "http://purl.org/rss/1.0/modules/content/",
        "wp": "http://wordpress.org/export/1.2/",
    }

    for item in root.findall(".//item"):
        # findtext returns "" for an empty element and the default for a
        # missing one, so an untitled post gets a placeholder instead of a
        # literal "None" heading (or an AttributeError).
        title = item.findtext("title") or "Untitled"

        # The publication date is required to place the post in a year
        # directory, so a missing or malformed value is left to raise.
        pub_date_str = item.find("pubDate").text
        pub_date = datetime.strptime(pub_date_str, "%a, %d %b %Y %H:%M:%S %z")

        # An empty or absent body becomes "" rather than None, which the
        # HTML-to-Markdown step could not process.
        content = item.findtext(
            "content:encoded", default="", namespaces=namespaces
        )

        markdown_content = extract_text_from_cdata(content)
        save_post_as_markdown(title, markdown_content, pub_date, destination_dir)
|
|
|
|
|
|
if __name__ == "__main__":
    import argparse

    # Command-line entry point: two required positionals, in this order.
    cli = argparse.ArgumentParser(
        description="Convert XML blog export to Markdown files organized by year."
    )
    for arg_name, arg_help in (
        ("xml_file", "Path to the XML file to be processed."),
        ("destination_dir", "Directory to save the Markdown files."),
    ):
        cli.add_argument(arg_name, help=arg_help)

    options = cli.parse_args()
    parse_xml_and_save_posts(options.xml_file, options.destination_dir)
|