Files
2024-08-15 18:24:40 -04:00

99 lines
3.1 KiB
Python

import os
import re
import xml.etree.ElementTree as ET
from datetime import datetime
import markdownify
import pathlib
import unicodedata
def slugify(value, allow_unicode=False):
    """
    Turn an arbitrary value into a slug that is safe to use as a file name.

    The value is stringified and Unicode-normalized: NFKC when
    ``allow_unicode`` is true, otherwise NFKD followed by dropping every
    character that cannot be encoded as ASCII. The result is lower-cased,
    stripped of everything except word characters, whitespace and hyphens
    (so path separators and dots never survive), and each run of whitespace
    and/or hyphens is collapsed into a single underscore. Leading and
    trailing hyphens/underscores are trimmed.

    The returned string may be empty when nothing usable remains.
    """
    text = str(value)
    if not allow_unicode:
        # Decompose accented characters, then discard the non-ASCII remainder.
        decomposed = unicodedata.normalize("NFKD", text)
        text = decomposed.encode("ascii", "ignore").decode("ascii")
    else:
        text = unicodedata.normalize("NFKC", text)

    # Remove anything that is not a word character, whitespace or a hyphen.
    cleaned = re.sub(r"[^\w\s-]", "", text.lower())
    # Collapse separator runs into one underscore and trim the edges.
    collapsed = re.sub(r"[-\s]+", "_", cleaned)
    return collapsed.strip("-_")
def extract_text_from_cdata(cdata):
    """
    Convert the HTML held in a CDATA section to Markdown.

    Every run of whitespace in the HTML is collapsed to a single space
    before the text is handed to ``markdownify`` with ATX-style (``#``)
    headings.

    ``cdata`` may be ``None`` or an empty string, which is what an empty
    XML element yields for its ``.text``; in that case an empty string is
    returned instead of raising ``TypeError`` from ``re.sub``.
    """
    if not cdata:
        return ""
    html_content = re.sub(r"\s+", " ", cdata)
    return markdownify.markdownify(html_content, heading_style="ATX")
def save_post_as_markdown(title, content, pub_date, destination_dir):
    """
    Write one blog post to ``<destination_dir>/<year>/<slug>.md``.

    The file starts with a ``# <title>`` heading followed by ``content``.
    After writing, the file's access and modification times are set to
    ``pub_date`` (``os.utime`` does not change the creation time).

    Parameters:
        title: Post title; ``None`` is treated as an empty title.
        content: Markdown body; ``None`` is treated as an empty body.
        pub_date: ``datetime`` of publication; supplies the year directory
            and the timestamps.
        destination_dir: Root output directory (``str`` or ``pathlib.Path``).
    """
    title_text = "" if title is None else str(title)
    body = "" if content is None else content

    # Determine the year and create a directory for it. Wrapping in Path
    # lets callers pass a plain string as well as a Path object.
    year_dir = pathlib.Path(destination_dir) / str(pub_date.year)
    year_dir.mkdir(parents=True, exist_ok=True)

    # Generate a safe filename. A title made only of punctuation (or an
    # empty one) slugifies to "", which would give a hidden ".md" file, so
    # fall back to a name derived from the publication date.
    slug = slugify(title_text) or pub_date.strftime("untitled_%Y%m%d_%H%M%S")
    file_path = year_dir / f"{slug}.md"

    # Write as UTF-8 explicitly: the platform default encoding may not be
    # able to represent the post's characters.
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(f"# {title_text}\n\n")
        file.write(body)

    # Stamp the file with the original publication date.
    pub_timestamp = pub_date.timestamp()
    os.utime(file_path, (pub_timestamp, pub_timestamp))
def parse_xml_and_save_posts(xml_file, destination_dir):
    """
    Parse an XML blog export and save every ``<item>`` as a Markdown file.

    Each item's ``<title>``, ``<pubDate>`` and ``<content:encoded>`` are
    read, the HTML content is converted to Markdown, and the result is
    written under ``destination_dir`` in a per-year subdirectory.

    Items whose ``<pubDate>`` is missing or does not match the RFC 822
    style format ``"%a, %d %b %Y %H:%M:%S %z"`` are skipped with a message
    on stderr rather than aborting the whole export. A missing or empty
    title becomes ``"Untitled"``; missing or empty content becomes an empty
    body.

    Parameters:
        xml_file: Path (or file object) accepted by ``ElementTree.parse``.
        destination_dir: Root output directory; created if necessary.

    Raises:
        xml.etree.ElementTree.ParseError: If the XML is not well-formed.
    """
    import sys  # local import: only needed for the skip messages below

    tree = ET.parse(xml_file)
    root = tree.getroot()
    destination_dir = pathlib.Path(destination_dir)
    destination_dir.mkdir(parents=True, exist_ok=True)

    # Define namespaces to handle XML namespaces in the file
    namespaces = {
        "content": "http://purl.org/rss/1.0/modules/content/",
        "wp": "http://wordpress.org/export/1.2/",
    }

    for item in root.findall(".//item"):
        # findtext returns None for a missing element and "" for an empty
        # one; both collapse to the fallback value here.
        title = item.findtext("title") or "Untitled"
        pub_date_str = (item.findtext("pubDate") or "").strip()
        try:
            pub_date = datetime.strptime(pub_date_str, "%a, %d %b %Y %H:%M:%S %z")
        except ValueError:
            print(
                f"Skipping {title!r}: missing or unparseable pubDate {pub_date_str!r}",
                file=sys.stderr,
            )
            continue

        content = item.findtext("content:encoded", namespaces=namespaces) or ""

        # Convert HTML content to Markdown
        markdown_content = extract_text_from_cdata(content)

        # Save the post as a markdown file organized by year
        save_post_as_markdown(title, markdown_content, pub_date, destination_dir)
if __name__ == "__main__":
    import argparse

    # Command-line entry point: two required positional arguments, the XML
    # export to read and the directory that receives the Markdown files.
    cli = argparse.ArgumentParser(
        description="Convert XML blog export to Markdown files organized by year."
    )
    positional_arguments = (
        ("xml_file", "Path to the XML file to be processed."),
        ("destination_dir", "Directory to save the Markdown files."),
    )
    for argument_name, argument_help in positional_arguments:
        cli.add_argument(argument_name, help=argument_help)

    options = cli.parse_args()
    parse_xml_and_save_posts(options.xml_file, options.destination_dir)