# letters/download_letters.py

#!/usr/bin/env python3
"""
Download and parse love letters from Project Gutenberg.

This script fetches letter collections from Gutenberg, extracts individual
letters, and saves them as JSON files in the letters/ directory. Run this
once (or with --force to re-download) to populate the data that
love_letters.py reads.

Usage:
    python3 download_letters.py           # download all sources
    python3 download_letters.py --force   # re-download everything
    python3 download_letters.py --list    # show available sources
"""

import json
import os
import re
import sys
import urllib.request

# Absolute directory containing this script; output paths are resolved against
# it rather than against the current working directory.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# Directory that receives one JSON file per source (created on demand by
# download_source).
LETTERS_DIR = os.path.join(SCRIPT_DIR, "letters")

# Catalogue of Project Gutenberg texts to fetch, one dict per source:
#   id           - key into EXTRACTORS and stem of the output JSON file name
#   title        - shown in progress messages and in the --list output
#   author, recipient, year - shown in the --list output
#   url          - plain-text download location passed to download_text()
#   gutenberg_id - ebook number, printed by --list as gutenberg.org/ebooks/<id>
SOURCES = [
    {
        "id": "henry_viii",
        "title": "The Love Letters of Henry VIII to Anne Boleyn",
        "author": "Henry VIII",
        "recipient": "Anne Boleyn",
        "year": "c. 1527–1528",
        "url": "https://www.gutenberg.org/cache/epub/32155/pg32155.txt",
        "gutenberg_id": 32155,
    },
    {
        "id": "wollstonecraft",
        "title": "The Love Letters of Mary Wollstonecraft to Gilbert Imlay",
        "author": "Mary Wollstonecraft",
        "recipient": "Gilbert Imlay",
        "year": "1793–1795",
        "url": "https://www.gutenberg.org/cache/epub/34413/pg34413.txt",
        "gutenberg_id": 34413,
    },
    {
        "id": "abelard_heloise",
        "title": "Letters of Abelard and Heloise",
        "author": "Abelard & Heloise",
        "recipient": "each other",
        "year": "12th century",
        "url": "https://www.gutenberg.org/cache/epub/35977/pg35977.txt",
        "gutenberg_id": 35977,
    },
    {
        "id": "napoleon",
        "title": "Napoleon's Letters to Josephine",
        "author": "Napoleon Bonaparte",
        "recipient": "Josephine",
        "year": "1796–1812",
        "url": "https://www.gutenberg.org/cache/epub/37499/pg37499.txt",
        "gutenberg_id": 37499,
    },
    {
        "id": "keats_brawne",
        "title": "Letters of John Keats to Fanny Brawne",
        "author": "John Keats",
        "recipient": "Fanny Brawne",
        "year": "1819–1820",
        "url": "https://www.gutenberg.org/cache/epub/60433/pg60433.txt",
        "gutenberg_id": 60433,
    },
    {
        "id": "browning",
        "title": "The Letters of Robert Browning and Elizabeth Barrett Barrett, Vol. 1",
        "author": "Robert Browning & Elizabeth Barrett Browning",
        "recipient": "each other",
        "year": "1845–1846",
        "url": "https://www.gutenberg.org/cache/epub/16182/pg16182.txt",
        "gutenberg_id": 16182,
    },
    {
        "id": "browning_vol2",
        "title": "The Letters of Robert Browning and Elizabeth Barrett Barrett, Vol. 2",
        "author": "Robert Browning & Elizabeth Barrett Browning",
        "recipient": "each other",
        "year": "1845–1846",
        "url": "https://www.gutenberg.org/cache/epub/73891/pg73891.txt",
        "gutenberg_id": 73891,
    },
    {
        "id": "burns_clarinda",
        "title": "Letters of Robert Burns to Clarinda",
        "author": "Robert Burns",
        "recipient": "Clarinda (Agnes McLehose)",
        "year": "1787–1794",
        "url": "https://www.gutenberg.org/cache/epub/9863/pg9863.txt",
        "gutenberg_id": 9863,
    },
    {
        "id": "dorothy_osborne",
        "title": "The Love Letters of Dorothy Osborne to Sir William Temple",
        "author": "Dorothy Osborne",
        "recipient": "Sir William Temple",
        "year": "1652–1654",
        "url": "https://www.gutenberg.org/cache/epub/12544/pg12544.txt",
        "gutenberg_id": 12544,
    },
    {
        "id": "beethoven",
        "title": "Beethoven's Letters 1790-1826, Volume 1",
        "author": "Ludwig van Beethoven",
        "recipient": "various (love letters selected)",
        "year": "1790–1826",
        "url": "https://www.gutenberg.org/cache/epub/13065/pg13065.txt",
        "gutenberg_id": 13065,
    },
    {
        "id": "mozart",
        "title": "The Letters of Wolfgang Amadeus Mozart, Volume 1",
        "author": "Wolfgang Amadeus Mozart",
        "recipient": "various (love letters selected)",
        "year": "1769–1791",
        "url": "https://www.gutenberg.org/cache/epub/5307/pg5307.txt",
        "gutenberg_id": 5307,
    },
]


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def download_text(url: str) -> str:
    """Fetch *url* and return its body decoded as UTF-8 text.

    The request is sent with a custom ``User-Agent`` header and a 30-second
    timeout. Bytes that are not valid UTF-8 are replaced instead of raising.
    """
    request = urllib.request.Request(
        url,
        headers={"User-Agent": "LoveLettersApp/1.0"},
    )
    with urllib.request.urlopen(request, timeout=30) as response:
        payload = response.read()
    return payload.decode("utf-8", errors="replace")


def strip_gutenberg(text: str) -> str:
    """Return *text* without the Project Gutenberg header and footer.

    Everything up to and including the line holding the first recognised
    START marker is dropped, then everything from the first recognised END
    marker onward. Markers are tried in the order listed; the first variant
    that occurs in the text wins. The result is whitespace-stripped.
    """
    start_markers = (
        "*** START OF THE PROJECT GUTENBERG EBOOK",
        "*** START OF THIS PROJECT GUTENBERG EBOOK",
        "***START OF THE PROJECT GUTENBERG EBOOK",
    )
    end_markers = (
        "*** END OF THE PROJECT GUTENBERG EBOOK",
        "*** END OF THIS PROJECT GUTENBERG EBOOK",
        "***END OF THE PROJECT GUTENBERG EBOOK",
        "End of the Project Gutenberg EBook",
        "End of Project Gutenberg",
    )

    # Header: locate the first marker variant present in the text.
    header_hit = next(
        ((text.find(tag), tag) for tag in start_markers if tag in text),
        None,
    )
    if header_hit is not None:
        position, tag = header_hit
        line_end = text.find("\n", position)
        if line_end == -1:
            # Marker sits on the last line: keep whatever follows the marker.
            text = text[position + len(tag):]
        else:
            text = text[line_end + 1:]

    # Footer: cut at the first marker variant present in the remaining text.
    footer_at = next(
        (found for found in map(text.find, end_markers) if found != -1),
        None,
    )
    if footer_at is not None:
        text = text[:footer_at]

    return text.strip()


def normalize(text: str) -> str:
    """Normalize line endings to LF.

    Converts Windows (CRLF) pairs first and then any remaining lone CR, so
    the newline-based section regexes used by the extractors see a single
    line-ending convention.
    """
    # Order matters: collapsing CRLF first keeps it from becoming two newlines.
    return text.replace("\r\n", "\n").replace("\r", "\n")


# ---------------------------------------------------------------------------
# Per-source extractors
# ---------------------------------------------------------------------------

def extract_henry_viii(text: str) -> list[dict]:
    """Parse the Henry VIII collection into one record per letter.

    The cleaned text is split before each "Letter <Ordinal>" heading (First
    through Eighteenth). Sections shorter than 80 characters, sections without
    a heading line, and bodies of 50 characters or fewer are skipped. Anything
    from a "Notes"/"NOTES" line onward is trimmed from the body. A section
    whose first 200 characters contain "Boleyn to" is attributed to Anne
    Boleyn writing to Cardinal Wolsey; all others to Henry VIII.
    """
    cleaned = normalize(strip_gutenberg(text))
    section_break = (
        r"\n{2,}(?=Letter\s+(?:First|Second|Third|Fourth|Fifth|Sixth|Seventh|"
        r"Eighth|Ninth|Tenth|Eleventh|Twelfth|Thirteenth|Fourteenth|"
        r"Fifteenth|Sixteenth|Seventeenth|Eighteenth)\b)"
    )
    records = []
    for raw_chunk in re.split(section_break, cleaned):
        chunk = raw_chunk.strip()
        head = re.match(r"(Letter\s+\w+)(?:\s+.*?)?\n", chunk, re.IGNORECASE)
        if head is None or len(chunk) < 80:
            continue

        content = chunk[head.end():].strip()
        # Cut editorial notes that trail the letter text.
        for notes_tag in ("\nNotes\n", "\nNOTES\n"):
            content = content.partition(notes_tag)[0].strip()
        if len(content) <= 50:
            continue

        from_anne = "Boleyn to" in chunk[:200]
        records.append({
            "heading": head.group(1),
            "body": content,
            "author": "Anne Boleyn" if from_anne else "Henry VIII",
            "recipient": "Cardinal Wolsey" if from_anne else "Anne Boleyn",
            "source": "The Love Letters of Henry VIII to Anne Boleyn",
            "period": "c. 1527–1528",
        })
    return records


def extract_wollstonecraft(text: str) -> list[dict]:
    """Parse the Wollstonecraft collection into one record per letter.

    Sections begin at a "LETTER <numeral>" heading line (matched
    case-insensitively). Sections shorter than 80 characters and bodies of
    50 characters or fewer are skipped.
    """
    cleaned = normalize(strip_gutenberg(text))
    chunks = re.split(
        r"\n{2,}(?=LETTER\s+[IVXLC0-9]+\.?\s*\n)", cleaned, flags=re.IGNORECASE
    )
    records = []
    for chunk in map(str.strip, chunks):
        head = re.match(r"(LETTER\s+[IVXLC0-9]+\.?)\s*\n", chunk, re.IGNORECASE)
        if head is None or len(chunk) < 80:
            continue
        content = chunk[head.end():].strip()
        if len(content) <= 50:
            continue
        records.append({
            "heading": head.group(1),
            "body": content,
            "author": "Mary Wollstonecraft",
            "recipient": "Gilbert Imlay",
            "source": "The Love Letters of Mary Wollstonecraft to Gilbert Imlay",
            "period": "1793–1795",
        })
    return records


def extract_abelard_heloise(text: str) -> list[dict]:
    """Parse the Abelard and Heloise collection into one record per letter.

    Sections begin at a "LETTER <numeral>" heading line (case-insensitive,
    optional trailing "." or ":"). The writer is inferred from a
    "heloise to abelard" / "abelard to heloise" phrase in the first 300
    characters of the body; when neither appears the letter is attributed to
    both. Sections under 120 characters and bodies of 50 characters or fewer
    are skipped.
    """
    cleaned = normalize(strip_gutenberg(text))
    chunks = re.split(
        r"\n{2,}(?=LETTER\s+[IVXLC0-9]+[.:]?\s*\n)", cleaned, flags=re.IGNORECASE
    )
    records = []
    for raw_chunk in chunks:
        chunk = raw_chunk.strip()
        head = re.match(r"(LETTER\s+[IVXLC0-9]+[.:]?)\s*\n", chunk, re.IGNORECASE)
        if head is None or len(chunk) < 120:
            continue

        content = chunk[head.end():].strip()
        opening = content[:300].lower()
        if "heloise to abelard" in opening:
            sender, addressee = "Heloise", "Abelard"
        elif "abelard to heloise" in opening:
            sender, addressee = "Abelard", "Heloise"
        else:
            sender, addressee = "Abelard & Heloise", "each other"

        if len(content) > 50:
            records.append({
                "heading": head.group(1),
                "body": content,
                "author": sender,
                "recipient": addressee,
                "source": "Letters of Abelard and Heloise",
                "period": "12th century",
            })
    return records


def extract_napoleon(text: str) -> list[dict]:
    """Parse Napoleon's letters into one record per "No. <n>." section.

    Sections shorter than 100 characters and bodies of 80 characters or
    fewer are skipped.
    """
    cleaned = normalize(strip_gutenberg(text))
    records = []
    for raw_chunk in re.split(r"\n{2,}(?=No\.\s*\d+\.\s*\n)", cleaned):
        chunk = raw_chunk.strip()
        number = re.match(r"(No\.\s*\d+\.)\s*\n", chunk)
        if number is None or len(chunk) < 100:
            continue
        content = chunk[number.end():].strip()
        if len(content) <= 80:
            continue
        records.append({
            "heading": number.group(1),
            "body": content,
            "author": "Napoleon Bonaparte",
            "recipient": "Josephine",
            "source": "Napoleon's Letters to Josephine, 1796–1812",
            "period": "1796–1812",
        })
    return records


def extract_keats_brawne(text: str) -> list[dict]:
    """Parse the Keats collection into one record per Roman-numeral section.

    A section starts at a line consisting of a Roman numeral followed by a
    period. Sections shorter than 100 characters and bodies of 50 characters
    or fewer are skipped. Headings are rendered as "Letter <numeral>".
    """
    cleaned = normalize(strip_gutenberg(text))
    numeral_line = re.compile(r"([IVXLC]+)\.\s*\n")
    records = []
    for raw_chunk in re.split(r"\n{2,}(?=[IVXLC]+\.\s*\n)", cleaned):
        chunk = raw_chunk.strip()
        numeral = numeral_line.match(chunk)
        if numeral and len(chunk) >= 100:
            content = chunk[numeral.end():].strip()
            if len(content) > 50:
                records.append({
                    "heading": f"Letter {numeral.group(1)}",
                    "body": content,
                    "author": "John Keats",
                    "recipient": "Fanny Brawne",
                    "source": "Letters of John Keats to Fanny Brawne",
                    "period": "1819–1820",
                })
    return records


def _extract_browning(text: str, vol_label: str) -> list[dict]:
    """Parse one volume of the Browning correspondence.

    Letters are delimited by an italic direction line, either
    ``_R.B. to E.B.B._`` or ``_E.B.B. to R.B._``, which also decides author
    and recipient. *vol_label* is appended to the "source" field. Sections
    shorter than 100 characters and bodies of 50 characters or fewer are
    skipped.
    """
    robert = "Robert Browning"
    elizabeth = "Elizabeth Barrett Browning"
    cleaned = normalize(strip_gutenberg(text))
    chunks = re.split(
        r"\n{2,}(?=_(?:R\.B\. to E\.B\.B\.|E\.B\.B\. to R\.B\.)_)", cleaned
    )
    records = []
    for raw_chunk in chunks:
        chunk = raw_chunk.strip()
        tag = re.match(r"_(R\.B\. to E\.B\.B\.|E\.B\.B\. to R\.B\.)_\s*\n", chunk)
        if tag is None or len(chunk) < 100:
            continue
        direction = tag.group(1)
        content = chunk[tag.end():].strip()
        if len(content) <= 50:
            continue
        if "R.B. to E.B.B." in direction:
            sender, addressee = robert, elizabeth
        else:
            sender, addressee = elizabeth, robert
        records.append({
            "heading": direction,
            "body": content,
            "author": sender,
            "recipient": addressee,
            "source": f"The Letters of Robert Browning and Elizabeth Barrett Barrett, {vol_label}",
            "period": "1845–1846",
        })
    return records


def extract_browning(text: str) -> list[dict]:
    """Extract the letters of the Browning correspondence, labelled Vol. 1."""
    volume = "Vol. 1"
    return _extract_browning(text, volume)


def extract_browning_vol2(text: str) -> list[dict]:
    """Extract the letters of the Browning correspondence, labelled Vol. 2."""
    volume = "Vol. 2"
    return _extract_browning(text, volume)


def extract_burns_clarinda(text: str) -> list[dict]:
    """Parse the "LETTERS TO CLARINDA" section of the Burns text.

    Returns an empty list when that section title is not found. The section
    ends at the next all-caps heading that follows three or more newlines
    (searched from 100 characters past the section title so the title itself
    cannot match). Inside the section, letters start at Roman-numeral lines;
    sections under 80 characters and bodies of 50 characters or fewer are
    skipped.
    """
    cleaned = normalize(strip_gutenberg(text))
    begin = cleaned.find("LETTERS TO CLARINDA")
    if begin < 0:
        return []

    section = cleaned[begin:]
    skip = 100  # offset that keeps the section's own title out of the search
    next_heading = re.search(r"\n{3,}[A-Z][A-Z ]{10,}\n", section[skip:])
    if next_heading is not None:
        section = section[:skip + next_heading.start()]

    records = []
    for raw_chunk in re.split(r"\n{2,}(?=[IVXLC]+\.\s*\n)", section):
        chunk = raw_chunk.strip()
        numeral = re.match(r"([IVXLC]+)\.\s*\n", chunk)
        if numeral is None or len(chunk) < 80:
            continue
        content = chunk[numeral.end():].strip()
        if len(content) <= 50:
            continue
        records.append({
            "heading": f"Letter {numeral.group(1)}",
            "body": content,
            "author": "Robert Burns",
            "recipient": "Clarinda (Agnes McLehose)",
            "source": "Letters of Robert Burns to Clarinda",
            "period": "1787–1794",
        })
    return records


def extract_dorothy_osborne(text: str) -> list[dict]:
    """Parse the Dorothy Osborne collection using the "SIR,--" salutation.

    Each line starting with "SIR,--" opens a letter. The heading is an
    italic (underscore-wrapped) line found directly before the salutation
    within the preceding 200 characters, falling back to "Letter <n>". A
    letter runs to the next salutation, cut short at a "_Letter <numeral>._"
    editorial marker if one occurs first; the last letter runs to the end of
    the text. The body is then truncated at the first paragraph that starts
    with an underscore followed by a capital letter. Bodies of 80 characters
    or fewer are skipped.
    """
    cleaned = normalize(strip_gutenberg(text))
    starts = [hit.start() for hit in re.finditer(r"^SIR,--", cleaned, re.MULTILINE)]
    # Pair every salutation with the one after it; the last gets no successor.
    stops = starts[1:] + [None]

    records = []
    for number, (begin, stop) in enumerate(zip(starts, stops), start=1):
        lead_in = cleaned[max(0, begin - 200):begin]
        dated = re.search(r"\n\n_([^_]+)_\.?\s*\n\s*$", lead_in)
        heading = dated.group(1).strip() if dated else ""

        if stop is None:
            span = cleaned[begin:]
        else:
            span = cleaned[begin:stop]
            # Editorial commentary for the following letter may start early.
            note = re.search(r"\n_Letter\s+[IVXLC]+\._", span)
            if note:
                span = span[:note.start()]

        # Drop trailing editorial paragraphs (those opening with "_X...").
        content = re.split(r"\n\n(?=_[A-Z])", span.strip())[0].strip()
        if len(content) <= 80:
            continue

        records.append({
            "heading": heading or f"Letter {number}",
            "body": content,
            "author": "Dorothy Osborne",
            "recipient": "Sir William Temple",
            "source": "The Love Letters of Dorothy Osborne to Sir William Temple",
            "period": "1652–1654",
        })
    return records


def extract_beethoven(text: str) -> list[dict]:
    """Extract the Beethoven letters that contain romantic keywords.

    Letters are headed by a number line, a blank line, then a "TO ..." line.
    Sections shorter than 100 characters are skipped. A letter is kept only
    if its "TO" line or body contains (case-insensitively, as a substring)
    one of the love keywords. The recipient is the "TO" line without its
    leading "TO" and without surrounding periods.

    Returns:
        A list of letter records with heading, body, author, recipient,
        source and period keys.
    """
    text = normalize(strip_gutenberg(text))
    # Letters are headed "N.\n\nTO ..." where N is a number
    parts = re.split(r"\n{2,}(?=\d+\.\s*\n\s*\nTO\s)", text)
    letters = []
    # Keywords to identify love/romantic letters
    love_keywords = [
        "immortal beloved", "my angel", "my love", "beloved",
        "my heart", "kiss", "embrace", "love you",
        "sweetheart", "giulietta", "guicciardi",
        "josephine brunsvik", "bettina", "brentano",
        "amalie sebald", "my all", "my second self",
        "ardently", "passionately", "tenderly yours",
    ]
    for part in parts:
        part = part.strip()
        m = re.match(r"(\d+)\.\s*\n\s*\n(TO\s+.+?)(?:\n|$)", part)
        if not m or len(part) < 100:
            continue
        num = m.group(1)
        to_line = m.group(2).strip()
        body = part[m.end():].strip()
        full_text = (to_line + " " + body).lower()
        # Only include letters with romantic content
        if any(kw in full_text for kw in love_keywords):
            # Remove only the leading "TO"; a blanket replace("TO ", "") would
            # also delete "TO " inside a name (e.g. the end of "OTTO ").
            recipient = re.sub(r"^TO\s+", "", to_line).strip(".")
            letters.append({
                "heading": f"No. {num} — {to_line}",
                "body": body,
                "author": "Ludwig van Beethoven",
                "recipient": recipient,
                "source": "Beethoven's Letters 1790–1826",
                "period": "1790–1826",
            })
    return letters


def extract_mozart(text: str) -> list[dict]:
    """Extract the Mozart letters that contain romantic keywords.

    Letters are numbered sections ("N." on its own line). Sections shorter
    than 100 characters are skipped. A letter is kept only if its body
    contains (case-insensitively, as a substring) one of the love keywords.
    When the body opens with a "TO ..." line, that line (minus the leading
    "TO" and surrounding periods) becomes the recipient; otherwise the
    recipient defaults to "Constanze Mozart".

    Returns:
        A list of letter records with heading, body, author, recipient,
        source and period keys.
    """
    text = normalize(strip_gutenberg(text))
    # Mozart's letters use numbered sections
    parts = re.split(r"\n{2,}(?=\d+\.\s*\n)", text)
    letters = []
    love_keywords = [
        "my love", "kiss", "beloved", "my heart",
        "my dear wife", "constanze", "my darling",
        "embrace you", "tender", "passionately",
        "aloysia", "my dearest wife",
    ]
    for part in parts:
        part = part.strip()
        m = re.match(r"(\d+)\.\s*\n", part)
        if not m or len(part) < 100:
            continue
        num = m.group(1)
        body = part[m.end():].strip()
        # Extract TO line if present
        to_match = re.match(r"(TO\s+.+?)(?:\n|$)", body)
        to_line = to_match.group(1).strip() if to_match else ""
        full_text = body.lower()
        if any(kw in full_text for kw in love_keywords):
            if to_line:
                # Remove only the leading "TO"; a blanket replace("TO ", "")
                # would also delete "TO " occurring inside the name itself.
                recipient = re.sub(r"^TO\s+", "", to_line).strip(".")
            else:
                recipient = "Constanze Mozart"
            letters.append({
                "heading": f"No. {num}" + (f" — {to_line}" if to_line else ""),
                "body": body,
                "author": "Wolfgang Amadeus Mozart",
                "recipient": recipient,
                "source": "The Letters of Wolfgang Amadeus Mozart",
                "period": "1769–1791",
            })
    return letters


# Dispatch table mapping a source "id" (see SOURCES) to the function that
# parses that source's raw text into a list of letter dicts. download_source
# looks the extractor up here and skips any source without an entry.
EXTRACTORS = {
    "henry_viii": extract_henry_viii,
    "wollstonecraft": extract_wollstonecraft,
    "abelard_heloise": extract_abelard_heloise,
    "napoleon": extract_napoleon,
    "keats_brawne": extract_keats_brawne,
    "browning": extract_browning,
    "browning_vol2": extract_browning_vol2,
    "burns_clarinda": extract_burns_clarinda,
    "dorothy_osborne": extract_dorothy_osborne,
    "beethoven": extract_beethoven,
    "mozart": extract_mozart,
}


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def download_source(source: dict, force: bool = False) -> int:
    """Download, parse, and save letters for one source.

    Args:
        source: An entry of SOURCES; its "id", "title" and "url" keys are used.
        force: When true, ignore an existing JSON file and fetch again.

    Returns:
        The number of letters available for the source: the size of the
        existing JSON file when it is reused, the number of letters written
        after a fresh download, or 0 when the download fails, no extractor is
        registered, or nothing could be extracted.
    """
    out_path = os.path.join(LETTERS_DIR, f"{source['id']}.json")
    if os.path.exists(out_path) and not force:
        try:
            # Context manager so the file handle is closed deterministically.
            with open(out_path, "r", encoding="utf-8") as f:
                return len(json.load(f))
        except ValueError as e:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueError
            # subclasses; a damaged cache (e.g. from an interrupted run) is
            # rebuilt instead of aborting the whole script.
            print(f"  ⚠  Cached file for {source['id']} is unreadable ({e}); re-downloading")

    print(f"  ⬇  Downloading: {source['title']}…", flush=True)
    try:
        raw = download_text(source["url"])
    except Exception as e:
        # Best effort: one unreachable source must not stop the others.
        print(f"  ⚠  Failed: {e}")
        return 0

    extractor = EXTRACTORS.get(source["id"])
    if extractor is None:
        print(f"  ⚠  No extractor for {source['id']}")
        return 0

    letters = extractor(raw)
    if not letters:
        print(f"  ⚠  No letters extracted from {source['title']}")
        return 0

    os.makedirs(LETTERS_DIR, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(letters, f, ensure_ascii=False, indent=2)

    print(f"  ✓  {len(letters)} letters saved → letters/{source['id']}.json")
    return len(letters)


def main() -> None:
    """Command-line entry point.

    With ``--list``, print the catalogue of sources and exit. Otherwise
    process every source in order (re-fetching existing ones when ``--force``
    is given) and print the total number of letters.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Download love letters from Project Gutenberg.")
    cli.add_argument("--force", action="store_true", help="re-download all sources")
    cli.add_argument("--list", action="store_true", help="list available sources")
    options = cli.parse_args()

    if not options.list:
        print("\n  📥 Downloading love letters from Project Gutenberg…\n")
        grand_total = sum(
            download_source(entry, force=options.force) for entry in SOURCES
        )
        print(f"\n  📬 Total: {grand_total} letters in {LETTERS_DIR}/\n")
        return

    print("\n  Available sources:\n")
    for position, entry in enumerate(SOURCES, 1):
        print(f"  {position:2}. [{entry['id']}] {entry['title']}")
        print(f"      {entry['author']} → {entry['recipient']} ({entry['year']})")
        print(f"      gutenberg.org/ebooks/{entry['gutenberg_id']}")
        print()


# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()