#!/usr/bin/env python3
"""
Download and parse love letters from Project Gutenberg.

This script fetches letter collections from Gutenberg, extracts individual
letters, and saves them as JSON files in the letters/ directory.

Run this once (or with --force to re-download) to populate the data that
love_letters.py reads.

Usage:
    python3 download_letters.py            # download all sources
    python3 download_letters.py --force    # re-download everything
    python3 download_letters.py --list     # show available sources
"""

import argparse
import json
import os
import re
import sys
import urllib.request

SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
LETTERS_DIR = os.path.join(SCRIPT_DIR, "letters")

# One entry per Gutenberg text. "id" doubles as the output file stem and as the
# key into EXTRACTORS below.
SOURCES = [
    {
        "id": "henry_viii",
        "title": "The Love Letters of Henry VIII to Anne Boleyn",
        "author": "Henry VIII",
        "recipient": "Anne Boleyn",
        "year": "c. 1527–1528",
        "url": "https://www.gutenberg.org/cache/epub/32155/pg32155.txt",
        "gutenberg_id": 32155,
    },
    {
        "id": "wollstonecraft",
        "title": "The Love Letters of Mary Wollstonecraft to Gilbert Imlay",
        "author": "Mary Wollstonecraft",
        "recipient": "Gilbert Imlay",
        "year": "1793–1795",
        "url": "https://www.gutenberg.org/cache/epub/34413/pg34413.txt",
        "gutenberg_id": 34413,
    },
    {
        "id": "abelard_heloise",
        "title": "Letters of Abelard and Heloise",
        "author": "Abelard & Heloise",
        "recipient": "each other",
        "year": "12th century",
        "url": "https://www.gutenberg.org/cache/epub/35977/pg35977.txt",
        "gutenberg_id": 35977,
    },
    {
        "id": "napoleon",
        "title": "Napoleon's Letters to Josephine",
        "author": "Napoleon Bonaparte",
        "recipient": "Josephine",
        "year": "1796–1812",
        "url": "https://www.gutenberg.org/cache/epub/37499/pg37499.txt",
        "gutenberg_id": 37499,
    },
    {
        "id": "keats_brawne",
        "title": "Letters of John Keats to Fanny Brawne",
        "author": "John Keats",
        "recipient": "Fanny Brawne",
        "year": "1819–1820",
        "url": "https://www.gutenberg.org/cache/epub/60433/pg60433.txt",
        "gutenberg_id": 60433,
    },
    {
        "id": "browning",
        "title": "The Letters of Robert Browning and Elizabeth Barrett Barrett, Vol. 1",
        "author": "Robert Browning & Elizabeth Barrett Browning",
        "recipient": "each other",
        "year": "1845–1846",
        "url": "https://www.gutenberg.org/cache/epub/16182/pg16182.txt",
        "gutenberg_id": 16182,
    },
    {
        "id": "browning_vol2",
        "title": "The Letters of Robert Browning and Elizabeth Barrett Barrett, Vol. 2",
        "author": "Robert Browning & Elizabeth Barrett Browning",
        "recipient": "each other",
        "year": "1845–1846",
        "url": "https://www.gutenberg.org/cache/epub/73891/pg73891.txt",
        "gutenberg_id": 73891,
    },
    {
        "id": "burns_clarinda",
        "title": "Letters of Robert Burns to Clarinda",
        "author": "Robert Burns",
        "recipient": "Clarinda (Agnes McLehose)",
        "year": "1787–1794",
        "url": "https://www.gutenberg.org/cache/epub/9863/pg9863.txt",
        "gutenberg_id": 9863,
    },
    {
        "id": "dorothy_osborne",
        "title": "The Love Letters of Dorothy Osborne to Sir William Temple",
        "author": "Dorothy Osborne",
        "recipient": "Sir William Temple",
        "year": "1652–1654",
        "url": "https://www.gutenberg.org/cache/epub/12544/pg12544.txt",
        "gutenberg_id": 12544,
    },
    {
        "id": "beethoven",
        "title": "Beethoven's Letters 1790-1826, Volume 1",
        "author": "Ludwig van Beethoven",
        "recipient": "various (love letters selected)",
        "year": "1790–1826",
        "url": "https://www.gutenberg.org/cache/epub/13065/pg13065.txt",
        "gutenberg_id": 13065,
    },
    {
        "id": "mozart",
        "title": "The Letters of Wolfgang Amadeus Mozart, Volume 1",
        "author": "Wolfgang Amadeus Mozart",
        "recipient": "various (love letters selected)",
        "year": "1769–1791",
        "url": "https://www.gutenberg.org/cache/epub/5307/pg5307.txt",
        "gutenberg_id": 5307,
    },
]


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def download_text(url: str) -> str:
    """Download a plain-text file from Project Gutenberg.

    The response is decoded as UTF-8; undecodable bytes are replaced rather
    than raising. Any error raised by ``urllib.request.urlopen`` (including a
    timeout after 30 seconds) propagates to the caller.
    """
    req = urllib.request.Request(url, headers={"User-Agent": "LoveLettersApp/1.0"})
    with urllib.request.urlopen(req, timeout=30) as resp:
        return resp.read().decode("utf-8", errors="replace")


def strip_gutenberg(text: str) -> str:
    """Remove Project Gutenberg header and footer boilerplate.

    Everything up to and including the line containing the first recognised
    START marker is dropped, as is everything from the first recognised END
    marker onwards. Text without markers is returned unchanged apart from
    surrounding whitespace being stripped.
    """
    for marker in [
        "*** START OF THE PROJECT GUTENBERG EBOOK",
        "*** START OF THIS PROJECT GUTENBERG EBOOK",
        "***START OF THE PROJECT GUTENBERG EBOOK",
    ]:
        idx = text.find(marker)
        if idx != -1:
            # The marker line also carries the book title; skip to its end.
            nl = text.find("\n", idx)
            text = text[nl + 1:] if nl != -1 else text[idx + len(marker):]
            break
    for marker in [
        "*** END OF THE PROJECT GUTENBERG EBOOK",
        "*** END OF THIS PROJECT GUTENBERG EBOOK",
        "***END OF THE PROJECT GUTENBERG EBOOK",
        "End of the Project Gutenberg EBook",
        "End of Project Gutenberg",
    ]:
        idx = text.find(marker)
        if idx != -1:
            text = text[:idx]
            break
    return text.strip()


def normalize(text: str) -> str:
    """Normalize line endings to ``\\n`` (handles both ``\\r\\n`` and lone ``\\r``)."""
    # Order matters: collapse CRLF first so it does not become two newlines.
    return text.replace("\r\n", "\n").replace("\r", "\n")


def _recipient_from_to_line(to_line: str) -> str:
    """Turn a heading such as ``"TO OTTO JAHN."`` into ``"OTTO JAHN"``.

    Only the leading ``TO`` is removed, so names that themselves contain
    ``"TO "`` (e.g. ``OTTO JAHN``) are left intact.
    """
    return re.sub(r"^TO\s+", "", to_line).strip(".")


# ---------------------------------------------------------------------------
# Per-source extractors
#
# Each extractor takes the raw downloaded text and returns a list of dicts
# with the keys: heading, body, author, recipient, source, period.
# ---------------------------------------------------------------------------

def extract_henry_viii(text: str) -> list[dict]:
    """Extract letters headed "Letter First" .. "Letter Eighteenth"."""
    text = normalize(strip_gutenberg(text))
    parts = re.split(
        r"\n{2,}(?=Letter\s+(?:First|Second|Third|Fourth|Fifth|Sixth|Seventh|"
        r"Eighth|Ninth|Tenth|Eleventh|Twelfth|Thirteenth|Fourteenth|"
        r"Fifteenth|Sixteenth|Seventeenth|Eighteenth)\b)",
        text,
    )
    letters = []
    for part in parts:
        part = part.strip()
        m = re.match(r"(Letter\s+\w+)(?:\s+.*?)?\n", part, re.IGNORECASE)
        if not m or len(part) < 80:
            continue
        heading = m.group(1)
        body = part[m.end():].strip()
        # Drop the editor's notes that follow a letter.
        for tag in ["\nNotes\n", "\nNOTES\n"]:
            idx = body.find(tag)
            if idx != -1:
                body = body[:idx].strip()
        author, recipient = "Henry VIII", "Anne Boleyn"
        # A heading mentioning "Boleyn to" marks a letter written by Anne.
        if "Boleyn to" in part[:200]:
            author, recipient = "Anne Boleyn", "Cardinal Wolsey"
        if len(body) > 50:
            letters.append({
                "heading": heading,
                "body": body,
                "author": author,
                "recipient": recipient,
                "source": "The Love Letters of Henry VIII to Anne Boleyn",
                "period": "c. 1527–1528",
            })
    return letters


def extract_wollstonecraft(text: str) -> list[dict]:
    """Extract letters headed "LETTER <number>" on a line of their own."""
    text = normalize(strip_gutenberg(text))
    parts = re.split(r"\n{2,}(?=LETTER\s+[IVXLC0-9]+\.?\s*\n)", text, flags=re.IGNORECASE)
    letters = []
    for part in parts:
        part = part.strip()
        m = re.match(r"(LETTER\s+[IVXLC0-9]+\.?)\s*\n", part, re.IGNORECASE)
        if not m or len(part) < 80:
            continue
        body = part[m.end():].strip()
        if len(body) > 50:
            letters.append({
                "heading": m.group(1),
                "body": body,
                "author": "Mary Wollstonecraft",
                "recipient": "Gilbert Imlay",
                "source": "The Love Letters of Mary Wollstonecraft to Gilbert Imlay",
                "period": "1793–1795",
            })
    return letters


def extract_abelard_heloise(text: str) -> list[dict]:
    """Extract "LETTER <number>" sections, attributing each by its opening text."""
    text = normalize(strip_gutenberg(text))
    parts = re.split(r"\n{2,}(?=LETTER\s+[IVXLC0-9]+[.:]?\s*\n)", text, flags=re.IGNORECASE)
    letters = []
    for part in parts:
        part = part.strip()
        m = re.match(r"(LETTER\s+[IVXLC0-9]+[.:]?)\s*\n", part, re.IGNORECASE)
        if not m or len(part) < 120:
            continue
        body = part[m.end():].strip()
        author, recipient = "Abelard & Heloise", "each other"
        # The direction, when stated, appears near the start of the letter.
        lower = body[:300].lower()
        if "heloise to abelard" in lower:
            author, recipient = "Heloise", "Abelard"
        elif "abelard to heloise" in lower:
            author, recipient = "Abelard", "Heloise"
        if len(body) > 50:
            letters.append({
                "heading": m.group(1),
                "body": body,
                "author": author,
                "recipient": recipient,
                "source": "Letters of Abelard and Heloise",
                "period": "12th century",
            })
    return letters


def extract_napoleon(text: str) -> list[dict]:
    """Extract letters headed "No. <digits>." on a line of their own."""
    text = normalize(strip_gutenberg(text))
    parts = re.split(r"\n{2,}(?=No\.\s*\d+\.\s*\n)", text)
    letters = []
    for part in parts:
        part = part.strip()
        m = re.match(r"(No\.\s*\d+\.)\s*\n", part)
        if not m or len(part) < 100:
            continue
        body = part[m.end():].strip()
        if len(body) > 80:
            letters.append({
                "heading": m.group(1),
                "body": body,
                "author": "Napoleon Bonaparte",
                "recipient": "Josephine",
                "source": "Napoleon's Letters to Josephine, 1796–1812",
                "period": "1796–1812",
            })
    return letters


def extract_keats_brawne(text: str) -> list[dict]:
    """Extract letters headed by a bare Roman numeral followed by a period."""
    text = normalize(strip_gutenberg(text))
    parts = re.split(r"\n{2,}(?=[IVXLC]+\.\s*\n)", text)
    letters = []
    for part in parts:
        part = part.strip()
        m = re.match(r"([IVXLC]+)\.\s*\n", part)
        if not m or len(part) < 100:
            continue
        body = part[m.end():].strip()
        if len(body) > 50:
            letters.append({
                "heading": f"Letter {m.group(1)}",
                "body": body,
                "author": "John Keats",
                "recipient": "Fanny Brawne",
                "source": "Letters of John Keats to Fanny Brawne",
                "period": "1819–1820",
            })
    return letters


def _extract_browning(text: str, vol_label: str) -> list[dict]:
    """Extract letters from a Browning correspondence volume.

    ``vol_label`` (e.g. ``"Vol. 1"``) is appended to the recorded source title.
    """
    text = normalize(strip_gutenberg(text))
    # Split on _R.B. to E.B.B._ or _E.B.B. to R.B._
    parts = re.split(r"\n{2,}(?=_(?:R\.B\. to E\.B\.B\.|E\.B\.B\. to R\.B\.)_)", text)
    letters = []
    for part in parts:
        part = part.strip()
        m = re.match(r"_(R\.B\. to E\.B\.B\.|E\.B\.B\. to R\.B\.)_\s*\n", part)
        if not m or len(part) < 100:
            continue
        direction = m.group(1)
        body = part[m.end():].strip()
        if "R.B. to E.B.B." in direction:
            author = "Robert Browning"
            recipient = "Elizabeth Barrett Browning"
        else:
            author = "Elizabeth Barrett Browning"
            recipient = "Robert Browning"
        if len(body) > 50:
            letters.append({
                "heading": direction,
                "body": body,
                "author": author,
                "recipient": recipient,
                "source": f"The Letters of Robert Browning and Elizabeth Barrett Barrett, {vol_label}",
                "period": "1845–1846",
            })
    return letters


def extract_browning(text: str) -> list[dict]:
    """Extract the letters of volume 1 of the Browning correspondence."""
    return _extract_browning(text, "Vol. 1")


def extract_browning_vol2(text: str) -> list[dict]:
    """Extract the letters of volume 2 of the Browning correspondence."""
    return _extract_browning(text, "Vol. 2")


def extract_burns_clarinda(text: str) -> list[dict]:
    """Extract the Roman-numeral letters of the "LETTERS TO CLARINDA" section.

    Returns an empty list when that section heading is not present.
    """
    text = normalize(strip_gutenberg(text))
    # Find the "LETTERS TO CLARINDA" section
    start_idx = text.find("LETTERS TO CLARINDA")
    if start_idx == -1:
        return []
    section = text[start_idx:]
    # End at next major section (all caps heading after blank lines).
    # The first 100 characters are skipped so the section's own heading
    # cannot be mistaken for the next one.
    end_match = re.search(r"\n{3,}[A-Z][A-Z ]{10,}\n", section[100:])
    if end_match:
        section = section[:100 + end_match.start()]
    parts = re.split(r"\n{2,}(?=[IVXLC]+\.\s*\n)", section)
    letters = []
    for part in parts:
        part = part.strip()
        m = re.match(r"([IVXLC]+)\.\s*\n", part)
        if not m or len(part) < 80:
            continue
        body = part[m.end():].strip()
        if len(body) > 50:
            letters.append({
                "heading": f"Letter {m.group(1)}",
                "body": body,
                "author": "Robert Burns",
                "recipient": "Clarinda (Agnes McLehose)",
                "source": "Letters of Robert Burns to Clarinda",
                "period": "1787–1794",
            })
    return letters


def extract_dorothy_osborne(text: str) -> list[dict]:
    """Extract letters by locating each "SIR,--" salutation.

    The heading is taken from an italicised (``_..._``) line directly before
    the salutation when one exists, otherwise it falls back to "Letter N".
    """
    text = normalize(strip_gutenberg(text))
    # Letters start with "SIR,--" after editorial commentary
    # Split by looking backwards from each "SIR,--" to find the date/heading
    letters = []
    # Find all "SIR,--" occurrences
    sir_positions = [m.start() for m in re.finditer(r"^SIR,--", text, re.MULTILINE)]
    for i, pos in enumerate(sir_positions):
        # Look for a date line just before the salutation
        preceding = text[max(0, pos - 200):pos]
        date_match = re.search(r"\n\n_([^_]+)_\.?\s*\n\s*$", preceding)
        heading = date_match.group(1).strip() if date_match else ""
        # Letter body extends to the next editorial section or next SIR
        if i + 1 < len(sir_positions):
            end = sir_positions[i + 1]
            # Try to find where editorial notes begin (usually with _Letter)
            editorial = re.search(r"\n_Letter\s+[IVXLC]+\._", text[pos:end])
            if editorial:
                end = pos + editorial.start()
        else:
            end = len(text)
        body = text[pos:end].strip()
        # Trim trailing editorial notes (paragraphs starting with special patterns)
        body = re.split(r"\n\n(?=_[A-Z])", body)[0].strip()
        if len(body) > 80:
            letters.append({
                "heading": heading if heading else f"Letter {i + 1}",
                "body": body,
                "author": "Dorothy Osborne",
                "recipient": "Sir William Temple",
                "source": "The Love Letters of Dorothy Osborne to Sir William Temple",
                "period": "1652–1654",
            })
    return letters


def extract_beethoven(text: str) -> list[dict]:
    """Extract numbered "TO ..." letters that contain a romantic keyword."""
    text = normalize(strip_gutenberg(text))
    # Letters are headed "N.\n\nTO ..." where N is a number
    parts = re.split(r"\n{2,}(?=\d+\.\s*\n\s*\nTO\s)", text)
    letters = []
    # Keywords to identify love/romantic letters
    love_keywords = [
        "immortal beloved", "my angel", "my love", "beloved", "my heart",
        "kiss", "embrace", "love you", "sweetheart", "giulietta",
        "guicciardi", "josephine brunsvik", "bettina", "brentano",
        "amalie sebald", "my all", "my second self", "ardently",
        "passionately", "tenderly yours",
    ]
    for part in parts:
        part = part.strip()
        m = re.match(r"(\d+)\.\s*\n\s*\n(TO\s+.+?)(?:\n|$)", part)
        if not m or len(part) < 100:
            continue
        num = m.group(1)
        to_line = m.group(2).strip()
        body = part[m.end():].strip()
        full_text = (to_line + " " + body).lower()
        # Only include letters with romantic content
        if any(kw in full_text for kw in love_keywords):
            letters.append({
                "heading": f"No. {num} — {to_line}",
                "body": body,
                "author": "Ludwig van Beethoven",
                "recipient": _recipient_from_to_line(to_line),
                "source": "Beethoven's Letters 1790–1826",
                "period": "1790–1826",
            })
    return letters


def extract_mozart(text: str) -> list[dict]:
    """Extract numbered letters that contain a romantic keyword.

    The recipient comes from a leading "TO ..." line when the letter has one,
    otherwise it defaults to "Constanze Mozart".
    """
    text = normalize(strip_gutenberg(text))
    # Mozart's letters use numbered sections
    parts = re.split(r"\n{2,}(?=\d+\.\s*\n)", text)
    letters = []
    love_keywords = [
        "my love", "kiss", "beloved", "my heart", "my dear wife",
        "constanze", "my darling", "embrace you", "tender",
        "passionately", "aloysia", "my dearest wife",
    ]
    for part in parts:
        part = part.strip()
        m = re.match(r"(\d+)\.\s*\n", part)
        if not m or len(part) < 100:
            continue
        num = m.group(1)
        body = part[m.end():].strip()
        # Extract TO line if present
        to_match = re.match(r"(TO\s+.+?)(?:\n|$)", body)
        to_line = to_match.group(1).strip() if to_match else ""
        full_text = body.lower()
        if any(kw in full_text for kw in love_keywords):
            recipient = _recipient_from_to_line(to_line) if to_line else "Constanze Mozart"
            letters.append({
                "heading": f"No. {num}" + (f" — {to_line}" if to_line else ""),
                "body": body,
                "author": "Wolfgang Amadeus Mozart",
                "recipient": recipient,
                "source": "The Letters of Wolfgang Amadeus Mozart",
                "period": "1769–1791",
            })
    return letters


# Maps a SOURCES "id" to the function that parses that source's text.
EXTRACTORS = {
    "henry_viii": extract_henry_viii,
    "wollstonecraft": extract_wollstonecraft,
    "abelard_heloise": extract_abelard_heloise,
    "napoleon": extract_napoleon,
    "keats_brawne": extract_keats_brawne,
    "browning": extract_browning,
    "browning_vol2": extract_browning_vol2,
    "burns_clarinda": extract_burns_clarinda,
    "dorothy_osborne": extract_dorothy_osborne,
    "beethoven": extract_beethoven,
    "mozart": extract_mozart,
}


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def download_source(source: dict, force: bool = False) -> int:
    """Download, parse, and save letters for one source. Returns letter count.

    When a cached JSON file already exists and ``force`` is false, its letter
    count is returned without touching the network. An unreadable or corrupt
    cache file is reported and treated as missing. Download or extraction
    failures are reported on stdout and yield 0 rather than raising.
    """
    out_path = os.path.join(LETTERS_DIR, f"{source['id']}.json")
    if os.path.exists(out_path) and not force:
        try:
            with open(out_path, "r", encoding="utf-8") as f:
                return len(json.load(f))
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f" ⚠ Ignoring unreadable cache for {source['id']}: {e}")

    # Resolve the extractor first so a missing one costs no network round trip.
    extractor = EXTRACTORS.get(source["id"])
    if extractor is None:
        print(f" ⚠ No extractor for {source['id']}")
        return 0

    print(f" ⬇ Downloading: {source['title']}…", flush=True)
    try:
        raw = download_text(source["url"])
    except Exception as e:
        # Deliberately broad: one failing source must not abort the others.
        print(f" ⚠ Failed: {e}")
        return 0

    letters = extractor(raw)
    if not letters:
        print(f" ⚠ No letters extracted from {source['title']}")
        return 0

    os.makedirs(LETTERS_DIR, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(letters, f, ensure_ascii=False, indent=2)
    print(f" ✓ {len(letters)} letters saved → letters/{source['id']}.json")
    return len(letters)


def main() -> None:
    """Parse command-line options, then list the sources or download them all."""
    parser = argparse.ArgumentParser(description="Download love letters from Project Gutenberg.")
    parser.add_argument("--force", action="store_true", help="re-download all sources")
    parser.add_argument("--list", action="store_true", help="list available sources")
    args = parser.parse_args()

    if args.list:
        print("\n Available sources:\n")
        for i, src in enumerate(SOURCES, 1):
            print(f" {i:2}. [{src['id']}] {src['title']}")
            print(f" {src['author']} → {src['recipient']} ({src['year']})")
            print(f" gutenberg.org/ebooks/{src['gutenberg_id']}")
            print()
        return

    print("\n 📥 Downloading love letters from Project Gutenberg…\n")
    total = 0
    for source in SOURCES:
        count = download_source(source, force=args.force)
        total += count
    print(f"\n 📬 Total: {total} letters in {LETTERS_DIR}/\n")


if __name__ == "__main__":
    main()