# letters/love_letters.py

#!/usr/bin/env python3
"""
Love Letters — Display random historic love letters from Project Gutenberg.

Sources:
  • Henry VIII to Anne Boleyn (c. 1527–1528)
  • Mary Wollstonecraft to Gilbert Imlay (1793–1795)
  • Letters of Abelard and Heloise (12th century)
  • Napoleon Bonaparte to Josephine (1796–1812)
  • John Keats to Fanny Brawne (1819–1820)
"""

import json
import os
import random
import re
import sys
import textwrap
import urllib.request

# Directory, located next to this file, where the parsed letters of each
# source are stored as "<source id>.json".
CACHE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), ".letter_cache")

# Catalogue of the supported collections. For each entry:
#   id           - key used to look up the extractor in EXTRACTORS, to name
#                  the cache file, and as the value accepted by --source
#   title        - collection title shown in listings and progress messages
#   author       - writer(s), shown by list_sources()
#   recipient    - addressee(s), shown by list_sources()
#   year         - human-readable date range, shown by list_sources()
#   url          - plain-text download URL on gutenberg.org
#   gutenberg_id - ebook number used to print the gutenberg.org/ebooks/ link
SOURCES = [
    {
        "id": "henry_viii",
        "title": "The Love Letters of Henry VIII to Anne Boleyn",
        "author": "Henry VIII",
        "recipient": "Anne Boleyn",
        "year": "c. 1527–1528",
        "url": "https://www.gutenberg.org/cache/epub/32155/pg32155.txt",
        "gutenberg_id": 32155,
    },
    {
        "id": "wollstonecraft",
        "title": "The Love Letters of Mary Wollstonecraft to Gilbert Imlay",
        "author": "Mary Wollstonecraft",
        "recipient": "Gilbert Imlay",
        "year": "1793–1795",
        "url": "https://www.gutenberg.org/cache/epub/34413/pg34413.txt",
        "gutenberg_id": 34413,
    },
    {
        "id": "abelard_heloise",
        "title": "Letters of Abelard and Heloise",
        "author": "Abelard & Heloise",
        "recipient": "each other",
        "year": "12th century",
        "url": "https://www.gutenberg.org/cache/epub/35977/pg35977.txt",
        "gutenberg_id": 35977,
    },
    {
        "id": "napoleon",
        "title": "Napoleon's Letters to Josephine",
        "author": "Napoleon Bonaparte",
        "recipient": "Josephine",
        "year": "1796–1812",
        "url": "https://www.gutenberg.org/cache/epub/37499/pg37499.txt",
        "gutenberg_id": 37499,
    },
    {
        "id": "keats_brawne",
        "title": "Letters of John Keats to Fanny Brawne",
        "author": "John Keats",
        "recipient": "Fanny Brawne",
        "year": "1819–1820",
        "url": "https://www.gutenberg.org/cache/epub/60433/pg60433.txt",
        "gutenberg_id": 60433,
    },
]

# Horizontal rule (60 box-drawing characters) printed around each letter.
SEPARATOR = "─" * 60


def download_text(url: str) -> str:
    """Fetch ``url`` and return its body as text.

    The request is sent with a custom ``User-Agent`` header and a 30-second
    timeout. The response bytes are decoded as UTF-8; byte sequences that
    are not valid UTF-8 are replaced with U+FFFD rather than raising.
    """
    request = urllib.request.Request(
        url,
        headers={"User-Agent": "LoveLettersApp/1.0"},
    )
    with urllib.request.urlopen(request, timeout=30) as response:
        payload = response.read()
    return payload.decode("utf-8", errors="replace")


def strip_gutenberg_header_footer(text: str) -> str:
    """Remove the Project Gutenberg header and footer boilerplate.

    Args:
        text: Full text of a Project Gutenberg plain-text file.

    Returns:
        The text between the "START OF ... PROJECT GUTENBERG EBOOK" line and
        the earliest end-of-book marker, with surrounding whitespace removed.
        If no start marker is found nothing is cut from the front; if no end
        marker is found nothing is cut from the back.
    """
    start_markers = (
        "*** START OF THE PROJECT GUTENBERG EBOOK",
        "*** START OF THIS PROJECT GUTENBERG EBOOK",
        "***START OF THE PROJECT GUTENBERG EBOOK",
    )
    end_markers = (
        "*** END OF THE PROJECT GUTENBERG EBOOK",
        "*** END OF THIS PROJECT GUTENBERG EBOOK",
        "***END OF THE PROJECT GUTENBERG EBOOK",
        "End of the Project Gutenberg EBook",
        "End of Project Gutenberg",
    )

    for marker in start_markers:
        idx = text.find(marker)
        if idx != -1:
            text = text[idx + len(marker):]
            # The marker line continues with the book title and a closing
            # "***"; drop the remainder of that line as well.
            nl = text.find("\n")
            if nl != -1:
                text = text[nl + 1:]
            break

    # A footer can contain several markers (for example an "End of the
    # Project Gutenberg EBook ..." line followed by a "*** END OF ..." line).
    # Cut at whichever occurs first in the text so none of them is left
    # behind at the end of the book.
    end_positions = [
        pos for pos in (text.find(marker) for marker in end_markers) if pos != -1
    ]
    if end_positions:
        text = text[:min(end_positions)]

    return text.strip()


# ---------------------------------------------------------------------------
# Per-source letter extraction
# ---------------------------------------------------------------------------

def extract_henry_viii(text: str) -> list[dict]:
    """Split the Henry VIII collection into one record per letter.

    The book is cut wherever a blank line is followed by a heading that uses
    a written-out ordinal ("Letter First" ... "Letter Eighteenth"). Chunks
    shorter than 80 characters, chunks without a heading line, and letters
    whose body is 50 characters or fewer are dropped. Anything from a
    "Notes"/"NOTES" line onwards is removed from the body.

    Returns a list of dicts with the keys ``heading``, ``body``, ``author``,
    ``recipient``, ``source`` and ``period``.
    """
    normalized = strip_gutenberg_header_footer(text).replace("\r\n", "\n")
    split_pattern = (
        r"\n{2,}(?=Letter\s+(?:First|Second|Third|Fourth|Fifth|Sixth|Seventh|"
        r"Eighth|Ninth|Tenth|Eleventh|Twelfth|Thirteenth|Fourteenth|"
        r"Fifteenth|Sixteenth|Seventeenth|Eighteenth)\b)"
    )
    heading_pattern = r"(Letter\s+\w+)(?:\s+.*?)?\n"

    letters = []
    for chunk in re.split(split_pattern, normalized):
        chunk = chunk.strip()
        if len(chunk) < 80:
            continue
        found = re.match(heading_pattern, chunk, re.IGNORECASE)
        if found is None:
            continue

        body = chunk[found.end():].strip()
        # Cut the trailing notes section, preferring the "Notes" spelling.
        for notes_marker in ("\nNotes\n", "\nNOTES\n"):
            cut = body.find(notes_marker)
            if cut != -1:
                body = body[:cut].strip()
                break
        if len(body) <= 50:
            continue

        # The opening of a chunk may name Anne Boleyn as the writer.
        opening = chunk[:200]
        if any(tag in opening for tag in ("Anne Boleyn to Wolsey", "Boleyn to")):
            author, recipient = "Anne Boleyn", "Cardinal Wolsey"
        else:
            author, recipient = "Henry VIII", "Anne Boleyn"

        letters.append({
            "heading": found.group(1),
            "body": body,
            "author": author,
            "recipient": recipient,
            "source": "The Love Letters of Henry VIII to Anne Boleyn",
            "period": "c. 1527–1528",
        })
    return letters


def extract_wollstonecraft(text: str) -> list[dict]:
    """Extract individual letters from the Wollstonecraft collection.

    The book is split wherever a blank line is followed by a
    ``LETTER <numeral>`` heading on its own line (case-insensitive, Roman or
    Arabic numerals, optional trailing period).

    Chunks that do not start with such a heading are skipped, as the other
    extractors in this file do. In practice that is whatever precedes the
    first heading, which would otherwise be returned as a "letter" with an
    empty heading attributed to Mary Wollstonecraft.

    Args:
        text: Raw text of the Project Gutenberg file.

    Returns:
        A list of dicts with the keys ``heading``, ``body``, ``author``,
        ``recipient``, ``source`` and ``period``. Chunks shorter than 80
        characters and bodies of 50 characters or fewer are omitted.
    """
    text = strip_gutenberg_header_footer(text)
    text = text.replace("\r\n", "\n")
    parts = re.split(r"\n{2,}(?=LETTER\s+[IVXLC0-9]+\.?\s*\n)", text, flags=re.IGNORECASE)
    letters = []
    for part in parts:
        part = part.strip()
        if len(part) < 80:
            continue
        m = re.match(r"(LETTER\s+[IVXLC0-9]+\.?)\s*\n", part, re.IGNORECASE)
        if not m:
            # Not a letter: no heading at the start of this chunk.
            continue
        body = part[m.end():].strip()
        if len(body) > 50:
            letters.append({
                "heading": m.group(1),
                "body": body,
                "author": "Mary Wollstonecraft",
                "recipient": "Gilbert Imlay",
                "source": "The Love Letters of Mary Wollstonecraft to Gilbert Imlay",
                "period": "1793–1795",
            })
    return letters


def extract_abelard_heloise(text: str) -> list[dict]:
    """Split the Abelard & Heloise collection into one record per letter.

    The book is cut wherever a blank line is followed by a
    ``LETTER <numeral>`` heading on its own line (case-insensitive, optional
    trailing "." or ":"). Chunks shorter than 120 characters, chunks without
    a heading, and bodies of 50 characters or fewer are dropped. The writer
    is taken from the first 300 characters of the body when they contain
    "heloise to abelard" or "abelard to heloise"; otherwise the letter is
    attributed to both.

    Returns a list of dicts with the keys ``heading``, ``body``, ``author``,
    ``recipient``, ``source`` and ``period``.
    """
    normalized = strip_gutenberg_header_footer(text).replace("\r\n", "\n")
    split_pattern = r"\n{2,}(?=LETTER\s+[IVXLC0-9]+[.:]?\s*\n)"
    heading_pattern = r"(LETTER\s+[IVXLC0-9]+[.:]?)\s*\n"

    letters = []
    for chunk in re.split(split_pattern, normalized, flags=re.IGNORECASE):
        chunk = chunk.strip()
        if len(chunk) < 120:
            continue
        found = re.match(heading_pattern, chunk, re.IGNORECASE)
        if found is None:
            continue
        body = chunk[found.end():].strip()
        if len(body) <= 50:
            continue

        opening = body[:300].lower()
        if "heloise to abelard" in opening:
            author, recipient = "Heloise", "Abelard"
        elif "abelard to heloise" in opening:
            author, recipient = "Abelard", "Heloise"
        else:
            author, recipient = "Abelard & Heloise", "each other"

        letters.append({
            "heading": found.group(1),
            "body": body,
            "author": author,
            "recipient": recipient,
            "source": "Letters of Abelard and Heloise",
            "period": "12th century",
        })
    return letters


def extract_napoleon(text: str) -> list[dict]:
    """Split Napoleon's letters to Josephine into one record per letter.

    The book is cut wherever a blank line is followed by a "No. <digits>."
    heading on its own line. Chunks shorter than 100 characters, chunks
    without a heading, and bodies shorter than 80 characters are dropped.

    Returns a list of dicts with the keys ``heading``, ``body``, ``author``,
    ``recipient``, ``source`` and ``period``.
    """
    normalized = strip_gutenberg_header_footer(text).replace("\r\n", "\n")

    letters = []
    for chunk in re.split(r"\n{2,}(?=No\.\s*\d+\.\s*\n)", normalized):
        chunk = chunk.strip()
        if len(chunk) < 100:
            continue
        found = re.match(r"(No\.\s*\d+\.)\s*\n", chunk)
        if found is None:
            continue
        body = chunk[found.end():].strip()
        # Very short bodies are treated as table-of-contents entries.
        if len(body) >= 80:
            letters.append({
                "heading": found.group(1),
                "body": body,
                "author": "Napoleon Bonaparte",
                "recipient": "Josephine",
                "source": "Napoleon's Letters to Josephine, 1796–1812",
                "period": "1796–1812",
            })
    return letters


def extract_keats_brawne(text: str) -> list[dict]:
    """Split Keats's letters to Fanny Brawne into one record per letter.

    The book is cut wherever a blank line is followed by an upper-case Roman
    numeral and a period on its own line ("I.", "II.", ...). Chunks shorter
    than 100 characters, chunks without such a heading, and bodies of 50
    characters or fewer are dropped. The stored heading is "Letter <numeral>".

    Returns a list of dicts with the keys ``heading``, ``body``, ``author``,
    ``recipient``, ``source`` and ``period``.
    """
    normalized = strip_gutenberg_header_footer(text).replace("\r\n", "\n")

    letters = []
    for chunk in re.split(r"\n{2,}(?=[IVXLC]+\.\s*\n)", normalized):
        chunk = chunk.strip()
        if len(chunk) < 100:
            continue
        found = re.match(r"([IVXLC]+)\.\s*\n", chunk)
        if found is None:
            continue
        numeral = found.group(1)
        body = chunk[found.end():].strip()
        if len(body) <= 50:
            continue
        letters.append({
            "heading": f"Letter {numeral}",
            "body": body,
            "author": "John Keats",
            "recipient": "Fanny Brawne",
            "source": "Letters of John Keats to Fanny Brawne",
            "period": "1819–1820",
        })
    return letters


# Maps each SOURCES "id" to the function that turns that collection's raw
# text into a list of letter dicts. fetch_and_parse() looks extractors up
# here and returns no letters for an id that has no entry.
EXTRACTORS = {
    "henry_viii": extract_henry_viii,
    "wollstonecraft": extract_wollstonecraft,
    "abelard_heloise": extract_abelard_heloise,
    "napoleon": extract_napoleon,
    "keats_brawne": extract_keats_brawne,
}


# ---------------------------------------------------------------------------
# Caching
# ---------------------------------------------------------------------------

def get_cache_path(source_id: str) -> str:
    """Return the path of the JSON cache file for ``source_id`` inside CACHE_DIR."""
    filename = f"{source_id}.json"
    return os.path.join(CACHE_DIR, filename)


def load_cached_letters(source_id: str) -> list[dict] | None:
    """Return the cached letters for ``source_id``, or ``None`` on a cache miss.

    A missing, unreadable, undecodable or malformed cache file is treated as
    a miss rather than an error, so the caller can rebuild the cache instead
    of crashing on a damaged file.

    Args:
        source_id: The ``id`` of an entry in SOURCES.

    Returns:
        The list stored in the cache file, or ``None`` if no usable cache
        exists.
    """
    path = get_cache_path(source_id)
    try:
        with open(path, "r", encoding="utf-8") as f:
            cached = json.load(f)
    except (OSError, ValueError):
        # OSError: the file is missing or cannot be read.
        # ValueError: covers json.JSONDecodeError and UnicodeDecodeError,
        # i.e. a truncated or otherwise corrupt cache file.
        return None
    # Callers extend a list with the result; anything else is not a cache
    # this program wrote.
    return cached if isinstance(cached, list) else None


def save_cached_letters(source_id: str, letters: list[dict]) -> None:
    """Write ``letters`` to the JSON cache file for ``source_id``.

    The data is written to a temporary file in the cache directory and then
    moved over the final path with ``os.replace``. An interrupted or failed
    write therefore never leaves a truncated cache file behind.

    Args:
        source_id: The ``id`` of an entry in SOURCES.
        letters: Letter dicts to store; must be JSON-serialisable.

    Raises:
        OSError: If the cache directory or file cannot be written.
    """
    os.makedirs(CACHE_DIR, exist_ok=True)
    path = get_cache_path(source_id)
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(letters, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, path)
    finally:
        # After a successful replace the temporary file no longer exists;
        # after a failure, remove the partial file.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


# ---------------------------------------------------------------------------
# Main logic
# ---------------------------------------------------------------------------

def fetch_and_parse(source: dict) -> list[dict]:
    """Return the letters of ``source``, using the cache when possible.

    On a cache miss the text is downloaded, passed to the extractor
    registered for the source's id, and the result is cached if it is
    non-empty.

    Args:
        source: An entry of SOURCES; the keys ``id``, ``title`` and ``url``
            are read.

    Returns:
        The list of letter dicts. The list is empty when the source has no
        extractor, the download fails, or the extractor finds nothing.
    """
    cached = load_cached_letters(source["id"])
    if cached is not None:
        return cached

    # Look the extractor up before downloading: without one the text could
    # not be parsed, so fetching it would be wasted work.
    extractor = EXTRACTORS.get(source["id"])
    if extractor is None:
        return []

    print(f"  Downloading: {source['title']}…", flush=True)
    try:
        raw = download_text(source["url"])
    except Exception as exc:
        # Deliberately broad: one unreachable collection must not stop the
        # others from loading. The failure is reported, not hidden.
        print(f"  ⚠  Failed to download {source['title']}: {exc}")
        return []

    letters = extractor(raw)
    if letters:
        try:
            save_cached_letters(source["id"], letters)
        except OSError as exc:
            # Caching is only an optimisation; the parsed letters are still
            # usable when the cache directory cannot be written.
            print(f"  ⚠  Could not cache {source['title']}: {exc}")
    return letters


def load_all_letters() -> list[dict]:
    """Return the letters of every entry in SOURCES, in SOURCES order.

    Each collection is obtained through fetch_and_parse(), which downloads
    it if it is not cached yet.
    """
    return [
        letter
        for source in SOURCES
        for letter in fetch_and_parse(source)
    ]


def wrap_text(text: str, width: int = 78) -> str:
    """Re-flow ``text`` to ``width`` columns, keeping paragraphs separate.

    Paragraphs are delimited by blank (or whitespace-only) lines. Within a
    paragraph all runs of whitespace collapse to single spaces before the
    paragraph is filled; paragraphs are joined with one blank line.
    """
    return "\n\n".join(
        textwrap.fill(" ".join(chunk.split()), width=width)
        for chunk in re.split(r"\n\s*\n", text)
    )


def truncate_letter(body: str, max_chars: int = 3000) -> str:
    """Shorten ``body`` to at most ``max_chars`` characters plus a notice.

    Text that already fits is returned unchanged. Otherwise the text is cut
    at ``max_chars`` and, when the last period of the cut text lies beyond
    its midpoint, trimmed back to end on that period. A "letter continues"
    notice is appended to anything that was shortened.
    """
    if len(body) <= max_chars:
        return body

    notice = "\n\n  […letter continues…]"
    excerpt = body[:max_chars]
    sentence_end = excerpt.rfind(".")
    # Only back up to a sentence end if that keeps more than half the text.
    if sentence_end > max_chars // 2:
        excerpt = excerpt[: sentence_end + 1]
    return excerpt + notice


def display_letter(letter: dict) -> None:
    """Print one letter to standard output.

    The output is a ruled header (writer, addressee, optional heading and
    period), the body truncated by truncate_letter() and re-flowed by
    wrap_text(), and a ruled footer naming the source collection.
    """
    # Header
    print("", SEPARATOR, sep="\n")
    sender, addressee = letter["author"], letter["recipient"]
    print(f"  ✉  {sender}  →  {addressee}")
    heading = letter.get("heading")
    if heading:
        print(f"     {heading}")
    print(f"     ({letter['period']})")
    print(SEPARATOR, "", sep="\n")

    # Body
    print(wrap_text(truncate_letter(letter["body"])))

    # Footer
    print("", SEPARATOR, sep="\n")
    print(f"  Source: {letter['source']}")
    print("  Via Project Gutenberg  •  gutenberg.org")
    print(SEPARATOR, "", sep="\n")


def list_sources() -> None:
    """Print a numbered list of the collections in SOURCES.

    Each entry shows the title, the writer, addressee and date range, and
    the collection's gutenberg.org ebook link, followed by a blank line.
    """
    print("\n  Available collections:\n")
    for number, source in enumerate(SOURCES, start=1):
        entry = (
            f"  {number}. {source['title']}",
            f"     {source['author']} → {source['recipient']} ({source['year']})",
            f"     gutenberg.org/ebooks/{source['gutenberg_id']}",
            "",
        )
        print("\n".join(entry))


def main() -> None:
    """Command-line entry point: parse the arguments and print random letters.

    Options:
        -n/--count N   number of letters to display (default 1); a negative
                       value is rejected with a usage error
        --list         print the available collections and exit
        --refresh      delete the cache directory before loading
        --source ID    only load and show letters from the collection ID

    Exits with status 1 when no letters could be loaded.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Display random historic love letters from Project Gutenberg.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=textwrap.dedent("""\
            examples:
              %(prog)s              Show a random love letter
              %(prog)s -n 3         Show 3 random love letters
              %(prog)s --list       List available collections
              %(prog)s --refresh    Re-download all sources
        """),
    )
    parser.add_argument(
        "-n", "--count", type=int, default=1, metavar="N",
        help="number of letters to display (default: 1)",
    )
    parser.add_argument(
        "--list", action="store_true",
        help="list available letter collections",
    )
    parser.add_argument(
        "--refresh", action="store_true",
        help="clear cache and re-download all sources",
    )
    parser.add_argument(
        "--source", type=str, metavar="ID",
        choices=[s["id"] for s in SOURCES],
        help="only show letters from a specific source",
    )

    args = parser.parse_args()

    # random.sample() raises ValueError for a negative sample size; report
    # it as a usage error instead of a traceback.
    if args.count < 0:
        parser.error("-n/--count must not be negative")

    if args.list:
        list_sources()
        return

    if args.refresh:
        import shutil
        if os.path.isdir(CACHE_DIR):
            shutil.rmtree(CACHE_DIR)
        print("  Cache cleared.")

    print("\n  💌 Love Letters — loading collections…\n")

    if args.source:
        # Select the collection by id and load only that one. Matching a
        # letter's "source" text against the SOURCES title is unreliable:
        # the Napoleon extractor stamps a longer title than SOURCES uses,
        # so a text comparison never matched for that collection.
        all_letters = [
            letter
            for source in SOURCES
            if source["id"] == args.source
            for letter in fetch_and_parse(source)
        ]
        if not all_letters:
            print(f"  No letters found for source '{args.source}'.")
            sys.exit(1)
    else:
        all_letters = load_all_letters()
        if not all_letters:
            print("  No letters could be loaded. Check your internet connection.")
            sys.exit(1)

    # Never ask for more letters than are available.
    count = min(args.count, len(all_letters))
    chosen = random.sample(all_letters, count)

    for letter in chosen:
        display_letter(letter)


if __name__ == "__main__":
    main()