# kennethreitz.org/scripts/convert_squarespace.py

import os
import re
import xml.etree.ElementTree as ET
from datetime import datetime
import markdownify
import pathlib
import unicodedata


def slugify(value, allow_unicode=False):
    """
    Reduce an arbitrary value to a lowercase, underscore-separated filename stem.

    The value is stringified and Unicode-normalized. Unless ``allow_unicode``
    is true, characters with no ASCII form are discarded. Everything other
    than word characters, whitespace and hyphens is then removed (so path
    separators and dots never survive), runs of whitespace/hyphens become a
    single underscore, and leading/trailing hyphens and underscores are
    trimmed. The result may be an empty string.
    """
    text = str(value)

    if allow_unicode:
        normalized = unicodedata.normalize("NFKC", text)
    else:
        # Decompose accented characters, then drop whatever cannot be
        # expressed in ASCII (combining marks, non-Latin scripts, ...).
        decomposed = unicodedata.normalize("NFKD", text)
        normalized = decomposed.encode("ascii", "ignore").decode("ascii")

    # Strip every character that is not a word character, whitespace or "-".
    cleaned = re.sub(r"[^\w\s-]", "", normalized.lower())

    # Collapse each separator run into one underscore, then trim the edges.
    joined = re.sub(r"[-\s]+", "_", cleaned)
    return joined.strip("-_")


def extract_text_from_cdata(cdata):
    """
    Convert the HTML held in a CDATA section to Markdown.

    Every run of whitespace in the HTML (newlines included) is collapsed to a
    single space before the text is handed to ``markdownify`` with ATX-style
    (``#``) headings.

    NOTE: because the collapse is applied to the whole string, line breaks
    inside preformatted content are flattened as well.

    Args:
        cdata: The HTML string, or ``None`` when the source element had no text.

    Returns:
        The Markdown rendering of ``cdata``; an empty string when ``cdata`` is
        ``None``.
    """
    if cdata is None:
        # ElementTree reports the text of an empty element as None; treat that
        # as an empty body instead of letting re.sub raise TypeError.
        return ""

    html_content = re.sub(r"\s+", " ", cdata)
    return markdownify.markdownify(html_content, heading_style="ATX")


def save_post_as_markdown(title, content, pub_date, destination_dir):
    """
    Write a blog post to ``<destination_dir>/<year>/<slug>.md``.

    The file starts with the title as a level-one heading, followed by the
    content. After writing, the file's access and modification times are set
    to the publication date (``os.utime`` does not change creation time).

    Args:
        title: Post title; used for the heading and, slugified, the filename.
        content: Markdown body of the post.
        pub_date: ``datetime`` of publication; its year selects the
            subdirectory and its timestamp is applied to the file.
        destination_dir: Root output directory, as a ``pathlib.Path`` or a
            plain path string.
    """
    # Coerce so that callers may pass a plain string as well as a Path.
    year_dir = pathlib.Path(destination_dir) / str(pub_date.year)
    year_dir.mkdir(parents=True, exist_ok=True)

    # A title made up solely of characters that slugify drops would otherwise
    # produce the bare filename ".md".
    slug = slugify(title) or "untitled"
    file_path = year_dir / f"{slug}.md"

    # Write UTF-8 explicitly: the platform default encoding may be unable to
    # represent characters that appear in titles or post bodies.
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(f"# {title}\n\n")
        file.write(content)

    # Stamp the file with the original publication date (atime and mtime).
    pub_timestamp = pub_date.timestamp()
    os.utime(file_path, (pub_timestamp, pub_timestamp))


def _child_text(element, path, namespaces=None):
    """Return the text of the first child matching ``path``, or None if it is missing or empty."""
    child = element.find(path, namespaces)
    if child is None:
        return None
    return child.text


def parse_xml_and_save_posts(xml_file, destination_dir):
    """
    Parse an XML blog export and save every ``<item>`` as a Markdown file.

    Each item's ``title``, ``pubDate`` and ``content:encoded`` children are
    read; the HTML content is converted to Markdown and written beneath
    ``destination_dir`` in a per-year subdirectory.

    Items with a missing or empty title are saved as "Untitled", and items
    with no content are saved with an empty body. Items whose ``pubDate`` is
    missing or does not match the expected format are skipped with a message,
    so one malformed entry does not abort the whole conversion.

    Args:
        xml_file: Path to (or file object for) the XML export.
        destination_dir: Directory under which the Markdown files are
            written; created if it does not exist.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_file`` is not well-formed XML.
    """
    tree = ET.parse(xml_file)
    root = tree.getroot()

    destination_dir = pathlib.Path(destination_dir)
    destination_dir.mkdir(parents=True, exist_ok=True)

    # Define namespaces to handle XML namespaces in the file
    namespaces = {
        "content": "http://purl.org/rss/1.0/modules/content/",
        "wp": "http://wordpress.org/export/1.2/",
    }

    for item in root.findall(".//item"):
        # An absent or empty <title> would otherwise surface as the literal
        # string "None" in both the heading and the filename.
        title = _child_text(item, "title") or "Untitled"

        pub_date_str = _child_text(item, "pubDate")
        if not pub_date_str:
            print(f"Skipping {title!r}: no publication date.")
            continue
        try:
            pub_date = datetime.strptime(
                pub_date_str.strip(), "%a, %d %b %Y %H:%M:%S %z"
            )
        except ValueError:
            print(
                f"Skipping {title!r}: unrecognized publication date "
                f"{pub_date_str!r}."
            )
            continue

        # Empty or missing bodies become an empty string rather than None.
        content = _child_text(item, "content:encoded", namespaces) or ""

        # Convert HTML content to Markdown
        markdown_content = extract_text_from_cdata(content)

        # Save the post as a markdown file organized by year
        save_post_as_markdown(title, markdown_content, pub_date, destination_dir)


if __name__ == "__main__":
    # Command-line entry point: takes the export file and the output
    # directory as positional arguments and runs the conversion.
    import argparse

    cli = argparse.ArgumentParser(
        description="Convert XML blog export to Markdown files organized by year."
    )
    positional_arguments = (
        ("xml_file", "Path to the XML file to be processed."),
        ("destination_dir", "Directory to save the Markdown files."),
    )
    for argument_name, argument_help in positional_arguments:
        cli.add_argument(argument_name, help=argument_help)

    options = cli.parse_args()

    parse_xml_and_save_posts(options.xml_file, options.destination_dir)