You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

550 lines
20 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#!/usr/bin/env python3
"""
Download and parse love letters from Project Gutenberg.
This script fetches letter collections from Gutenberg, extracts individual
letters, and saves them as JSON files in the letters/ directory. Run this
once (or with --force to re-download) to populate the data that
love_letters.py reads.
Usage:
python3 download_letters.py # download all sources
python3 download_letters.py --force # re-download everything
python3 download_letters.py --list # show available sources
"""
import json
import os
import re
import sys
import urllib.request
# Directory containing this script; parsed output lives in a "letters"
# sub-directory next to it.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
LETTERS_DIR = os.path.join(SCRIPT_DIR, "letters")

# Catalogue of downloadable collections. Each entry's "id" names the output
# JSON file and selects the extractor used to parse the text at "url".
# "year" is a display string; ranges are written with an en dash.
SOURCES = [
    {
        "id": "henry_viii",
        "title": "The Love Letters of Henry VIII to Anne Boleyn",
        "author": "Henry VIII",
        "recipient": "Anne Boleyn",
        "year": "c. 1527–1528",
        "url": "https://www.gutenberg.org/cache/epub/32155/pg32155.txt",
        "gutenberg_id": 32155,
    },
    {
        "id": "wollstonecraft",
        "title": "The Love Letters of Mary Wollstonecraft to Gilbert Imlay",
        "author": "Mary Wollstonecraft",
        "recipient": "Gilbert Imlay",
        "year": "1793–1795",
        "url": "https://www.gutenberg.org/cache/epub/34413/pg34413.txt",
        "gutenberg_id": 34413,
    },
    {
        "id": "abelard_heloise",
        "title": "Letters of Abelard and Heloise",
        "author": "Abelard & Heloise",
        "recipient": "each other",
        "year": "12th century",
        "url": "https://www.gutenberg.org/cache/epub/35977/pg35977.txt",
        "gutenberg_id": 35977,
    },
    {
        "id": "napoleon",
        "title": "Napoleon's Letters to Josephine",
        "author": "Napoleon Bonaparte",
        "recipient": "Josephine",
        "year": "1796–1812",
        "url": "https://www.gutenberg.org/cache/epub/37499/pg37499.txt",
        "gutenberg_id": 37499,
    },
    {
        "id": "keats_brawne",
        "title": "Letters of John Keats to Fanny Brawne",
        "author": "John Keats",
        "recipient": "Fanny Brawne",
        "year": "1819–1820",
        "url": "https://www.gutenberg.org/cache/epub/60433/pg60433.txt",
        "gutenberg_id": 60433,
    },
    {
        "id": "browning",
        "title": "The Letters of Robert Browning and Elizabeth Barrett Barrett, Vol. 1",
        "author": "Robert Browning & Elizabeth Barrett Browning",
        "recipient": "each other",
        "year": "1845–1846",
        "url": "https://www.gutenberg.org/cache/epub/16182/pg16182.txt",
        "gutenberg_id": 16182,
    },
    {
        "id": "browning_vol2",
        "title": "The Letters of Robert Browning and Elizabeth Barrett Barrett, Vol. 2",
        "author": "Robert Browning & Elizabeth Barrett Browning",
        "recipient": "each other",
        "year": "1845–1846",
        "url": "https://www.gutenberg.org/cache/epub/73891/pg73891.txt",
        "gutenberg_id": 73891,
    },
    {
        "id": "burns_clarinda",
        "title": "Letters of Robert Burns to Clarinda",
        "author": "Robert Burns",
        "recipient": "Clarinda (Agnes McLehose)",
        "year": "1787–1794",
        "url": "https://www.gutenberg.org/cache/epub/9863/pg9863.txt",
        "gutenberg_id": 9863,
    },
    {
        "id": "dorothy_osborne",
        "title": "The Love Letters of Dorothy Osborne to Sir William Temple",
        "author": "Dorothy Osborne",
        "recipient": "Sir William Temple",
        "year": "1652–1654",
        "url": "https://www.gutenberg.org/cache/epub/12544/pg12544.txt",
        "gutenberg_id": 12544,
    },
    {
        "id": "beethoven",
        "title": "Beethoven's Letters 1790-1826, Volume 1",
        "author": "Ludwig van Beethoven",
        "recipient": "various (love letters selected)",
        "year": "1790–1826",
        "url": "https://www.gutenberg.org/cache/epub/13065/pg13065.txt",
        "gutenberg_id": 13065,
    },
    {
        "id": "mozart",
        "title": "The Letters of Wolfgang Amadeus Mozart, Volume 1",
        "author": "Wolfgang Amadeus Mozart",
        "recipient": "various (love letters selected)",
        "year": "1769–1791",
        "url": "https://www.gutenberg.org/cache/epub/5307/pg5307.txt",
        "gutenberg_id": 5307,
    },
]
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def download_text(url: str) -> str:
    """Fetch *url* and return its contents as text.

    The request is sent with a custom ``User-Agent`` header and a 30 second
    timeout. The response bytes are decoded as UTF-8; undecodable bytes are
    replaced instead of raising.
    """
    request = urllib.request.Request(
        url,
        headers={"User-Agent": "LoveLettersApp/1.0"},
    )
    with urllib.request.urlopen(request, timeout=30) as response:
        payload = response.read()
    return payload.decode("utf-8", errors="replace")
def strip_gutenberg(text: str) -> str:
    """Return *text* without the Project Gutenberg header and footer.

    Everything up to the end of the line holding the first recognised
    "START OF ... PROJECT GUTENBERG EBOOK" marker is dropped, as is
    everything from the first recognised end marker onwards. Markers are
    tried in the order listed and only the first one present is used. The
    result is stripped of surrounding whitespace.
    """
    start_markers = (
        "*** START OF THE PROJECT GUTENBERG EBOOK",
        "*** START OF THIS PROJECT GUTENBERG EBOOK",
        "***START OF THE PROJECT GUTENBERG EBOOK",
    )
    end_markers = (
        "*** END OF THE PROJECT GUTENBERG EBOOK",
        "*** END OF THIS PROJECT GUTENBERG EBOOK",
        "***END OF THE PROJECT GUTENBERG EBOOK",
        "End of the Project Gutenberg EBook",
        "End of Project Gutenberg",
    )

    opening = next((m for m in start_markers if m in text), None)
    if opening is not None:
        at = text.find(opening)
        line_end = text.find("\n", at)
        if line_end == -1:
            # Marker sits on the final line: keep only what follows it.
            text = text[at + len(opening):]
        else:
            text = text[line_end + 1:]

    closing = next((m for m in end_markers if m in text), None)
    if closing is not None:
        text = text[:text.find(closing)]

    return text.strip()
def normalize(text: str) -> str:
    """Normalize line endings in *text* to ``"\\n"``.

    Windows (``"\\r\\n"``) endings are converted first, then any remaining
    lone carriage returns, so the blank-line based splitting done by the
    extractors sees a single newline convention.
    """
    return text.replace("\r\n", "\n").replace("\r", "\n")
# ---------------------------------------------------------------------------
# Per-source extractors
# ---------------------------------------------------------------------------
def extract_henry_viii(text: str) -> list[dict]:
    """Extract individual letters from the Henry VIII / Anne Boleyn text.

    The text is split at blank-line separated headings "Letter First"
    through "Letter Eighteenth". Anything from a "Notes"/"NOTES" line
    onwards is cut from a letter body. A chunk whose first 200 characters
    contain "Boleyn to" is attributed to Anne Boleyn writing to Cardinal
    Wolsey; all other letters are attributed to Henry VIII writing to Anne
    Boleyn.

    Args:
        text: Raw Project Gutenberg text of the collection.

    Returns:
        One dict per letter with the keys ``heading``, ``body``, ``author``,
        ``recipient``, ``source`` and ``period``.
    """
    text = normalize(strip_gutenberg(text))
    chunks = re.split(
        r"\n{2,}(?=Letter\s+(?:First|Second|Third|Fourth|Fifth|Sixth|Seventh|"
        r"Eighth|Ninth|Tenth|Eleventh|Twelfth|Thirteenth|Fourteenth|"
        r"Fifteenth|Sixteenth|Seventeenth|Eighteenth)\b)",
        text,
    )
    letters = []
    for chunk in chunks:
        chunk = chunk.strip()
        header = re.match(r"(Letter\s+\w+)(?:\s+.*?)?\n", chunk, re.IGNORECASE)
        if header is None or len(chunk) < 80:
            continue
        body = chunk[header.end():].strip()
        # Drop editorial notes appended after the letter text.
        for tag in ("\nNotes\n", "\nNOTES\n"):
            cut = body.find(tag)
            if cut != -1:
                body = body[:cut].strip()
        if len(body) <= 50:
            continue
        if "Boleyn to" in chunk[:200]:
            author, recipient = "Anne Boleyn", "Cardinal Wolsey"
        else:
            author, recipient = "Henry VIII", "Anne Boleyn"
        letters.append({
            "heading": header.group(1),
            "body": body,
            "author": author,
            "recipient": recipient,
            "source": "The Love Letters of Henry VIII to Anne Boleyn",
            # Year range with its en dash restored (was "c. 15271528").
            "period": "c. 1527–1528",
        })
    return letters
def extract_wollstonecraft(text: str) -> list[dict]:
    """Extract Mary Wollstonecraft's letters to Gilbert Imlay.

    Letters are delimited by blank-line separated headings of the form
    "LETTER <roman numeral or digits>" on a line of their own (matched
    case-insensitively). Chunks shorter than 80 characters, or whose body
    is 50 characters or fewer, are skipped.

    Args:
        text: Raw Project Gutenberg text of the collection.

    Returns:
        One dict per letter with the keys ``heading``, ``body``, ``author``,
        ``recipient``, ``source`` and ``period``.
    """
    text = normalize(strip_gutenberg(text))
    chunks = re.split(
        r"\n{2,}(?=LETTER\s+[IVXLC0-9]+\.?\s*\n)", text, flags=re.IGNORECASE
    )
    letters = []
    for chunk in chunks:
        chunk = chunk.strip()
        header = re.match(r"(LETTER\s+[IVXLC0-9]+\.?)\s*\n", chunk, re.IGNORECASE)
        if header is None or len(chunk) < 80:
            continue
        body = chunk[header.end():].strip()
        if len(body) <= 50:
            continue
        letters.append({
            "heading": header.group(1),
            "body": body,
            "author": "Mary Wollstonecraft",
            "recipient": "Gilbert Imlay",
            "source": "The Love Letters of Mary Wollstonecraft to Gilbert Imlay",
            # Year range with its en dash restored (was "17931795").
            "period": "1793–1795",
        })
    return letters
def extract_abelard_heloise(text: str) -> list[dict]:
    """Extract the letters exchanged between Abelard and Heloise.

    Sections begin at a "LETTER <numeral>" heading line (optionally ending
    in "." or ":"), matched case-insensitively. The writer is inferred from
    the first 300 characters of the body: "heloise to abelard" or
    "abelard to heloise"; when neither phrase appears the letter is
    attributed to both, addressed to "each other".

    Returns one dict per letter with the keys ``heading``, ``body``,
    ``author``, ``recipient``, ``source`` and ``period``.
    """
    cleaned = normalize(strip_gutenberg(text))
    sections = re.split(
        r"\n{2,}(?=LETTER\s+[IVXLC0-9]+[.:]?\s*\n)", cleaned, flags=re.IGNORECASE
    )
    results = []
    for raw in sections:
        section = raw.strip()
        found = re.match(r"(LETTER\s+[IVXLC0-9]+[.:]?)\s*\n", section, re.IGNORECASE)
        if found is None or len(section) < 120:
            continue
        body = section[found.end():].strip()
        if len(body) <= 50:
            continue

        opening = body[:300].lower()
        if "heloise to abelard" in opening:
            writer, addressee = "Heloise", "Abelard"
        elif "abelard to heloise" in opening:
            writer, addressee = "Abelard", "Heloise"
        else:
            writer, addressee = "Abelard & Heloise", "each other"

        results.append({
            "heading": found.group(1),
            "body": body,
            "author": writer,
            "recipient": addressee,
            "source": "Letters of Abelard and Heloise",
            "period": "12th century",
        })
    return results
def extract_napoleon(text: str) -> list[dict]:
    """Extract Napoleon's letters to Josephine.

    Letters are delimited by blank-line separated "No. <digits>." heading
    lines. Chunks shorter than 100 characters, or whose body is 80
    characters or fewer, are skipped.

    Args:
        text: Raw Project Gutenberg text of the collection.

    Returns:
        One dict per letter with the keys ``heading``, ``body``, ``author``,
        ``recipient``, ``source`` and ``period``.
    """
    text = normalize(strip_gutenberg(text))
    chunks = re.split(r"\n{2,}(?=No\.\s*\d+\.\s*\n)", text)
    letters = []
    for chunk in chunks:
        chunk = chunk.strip()
        header = re.match(r"(No\.\s*\d+\.)\s*\n", chunk)
        if header is None or len(chunk) < 100:
            continue
        body = chunk[header.end():].strip()
        if len(body) <= 80:
            continue
        letters.append({
            "heading": header.group(1),
            "body": body,
            "author": "Napoleon Bonaparte",
            "recipient": "Josephine",
            # Year range with its en dash restored (was "17961812").
            "source": "Napoleon's Letters to Josephine, 1796–1812",
            "period": "1796–1812",
        })
    return letters
def extract_keats_brawne(text: str) -> list[dict]:
    """Extract John Keats's letters to Fanny Brawne.

    Letters are delimited by blank-line separated heading lines consisting
    of a roman numeral followed by a full stop. The stored heading is
    "Letter <numeral>". Chunks shorter than 100 characters, or whose body
    is 50 characters or fewer, are skipped.

    Args:
        text: Raw Project Gutenberg text of the collection.

    Returns:
        One dict per letter with the keys ``heading``, ``body``, ``author``,
        ``recipient``, ``source`` and ``period``.
    """
    text = normalize(strip_gutenberg(text))
    chunks = re.split(r"\n{2,}(?=[IVXLC]+\.\s*\n)", text)
    letters = []
    for chunk in chunks:
        chunk = chunk.strip()
        header = re.match(r"([IVXLC]+)\.\s*\n", chunk)
        if header is None or len(chunk) < 100:
            continue
        body = chunk[header.end():].strip()
        if len(body) <= 50:
            continue
        letters.append({
            "heading": f"Letter {header.group(1)}",
            "body": body,
            "author": "John Keats",
            "recipient": "Fanny Brawne",
            "source": "Letters of John Keats to Fanny Brawne",
            # Year range with its en dash restored (was "18191820").
            "period": "1819–1820",
        })
    return letters
def _extract_browning(text: str, vol_label: str) -> list[dict]:
    """Extract letters from one volume of the Browning correspondence.

    Each letter starts with an italic direction line, either
    ``_R.B. to E.B.B._`` or ``_E.B.B. to R.B._``, which also determines the
    author and recipient. Chunks shorter than 100 characters, or whose body
    is 50 characters or fewer, are skipped.

    Args:
        text: Raw Project Gutenberg text of the volume.
        vol_label: Volume label appended to the ``source`` title,
            e.g. "Vol. 1".

    Returns:
        One dict per letter with the keys ``heading``, ``body``, ``author``,
        ``recipient``, ``source`` and ``period``.
    """
    text = normalize(strip_gutenberg(text))
    chunks = re.split(
        r"\n{2,}(?=_(?:R\.B\. to E\.B\.B\.|E\.B\.B\. to R\.B\.)_)", text
    )
    letters = []
    for chunk in chunks:
        chunk = chunk.strip()
        header = re.match(r"_(R\.B\. to E\.B\.B\.|E\.B\.B\. to R\.B\.)_\s*\n", chunk)
        if header is None or len(chunk) < 100:
            continue
        direction = header.group(1)
        body = chunk[header.end():].strip()
        if len(body) <= 50:
            continue
        if "R.B. to E.B.B." in direction:
            author, recipient = "Robert Browning", "Elizabeth Barrett Browning"
        else:
            author, recipient = "Elizabeth Barrett Browning", "Robert Browning"
        letters.append({
            "heading": direction,
            "body": body,
            "author": author,
            "recipient": recipient,
            "source": f"The Letters of Robert Browning and Elizabeth Barrett Barrett, {vol_label}",
            # Year range with its en dash restored (was "18451846").
            "period": "1845–1846",
        })
    return letters
def extract_browning(text: str) -> list[dict]:
    """Extract the Browning letters, labelling the source as "Vol. 1"."""
    return _extract_browning(text, vol_label="Vol. 1")
def extract_browning_vol2(text: str) -> list[dict]:
    """Extract the Browning letters, labelling the source as "Vol. 2"."""
    return _extract_browning(text, vol_label="Vol. 2")
def extract_burns_clarinda(text: str) -> list[dict]:
    """Extract Robert Burns's letters to Clarinda.

    Only the part of the text starting at the first "LETTERS TO CLARINDA"
    occurrence is considered; it ends at the next all-caps heading that
    follows at least three newlines (searched from 100 characters into the
    section so the section's own title is not matched). Inside the section,
    letters are delimited by roman-numeral heading lines.

    Args:
        text: Raw Project Gutenberg text containing the Clarinda letters.

    Returns:
        One dict per letter with the keys ``heading``, ``body``, ``author``,
        ``recipient``, ``source`` and ``period``; an empty list when the
        section title is not found.
    """
    text = normalize(strip_gutenberg(text))
    anchor = text.find("LETTERS TO CLARINDA")
    if anchor == -1:
        return []
    section = text[anchor:]
    boundary = re.search(r"\n{3,}[A-Z][A-Z ]{10,}\n", section[100:])
    if boundary:
        # The match offset is relative to section[100:], hence the +100.
        section = section[:100 + boundary.start()]

    chunks = re.split(r"\n{2,}(?=[IVXLC]+\.\s*\n)", section)
    letters = []
    for chunk in chunks:
        chunk = chunk.strip()
        header = re.match(r"([IVXLC]+)\.\s*\n", chunk)
        if header is None or len(chunk) < 80:
            continue
        body = chunk[header.end():].strip()
        if len(body) <= 50:
            continue
        letters.append({
            "heading": f"Letter {header.group(1)}",
            "body": body,
            "author": "Robert Burns",
            "recipient": "Clarinda (Agnes McLehose)",
            "source": "Letters of Robert Burns to Clarinda",
            # Year range with its en dash restored (was "17871794").
            "period": "1787–1794",
        })
    return letters
def extract_dorothy_osborne(text: str) -> list[dict]:
    """Extract Dorothy Osborne's letters to Sir William Temple.

    Every line starting with the salutation "SIR,--" opens a letter. The
    heading is an italic (``_..._``) line found within the 200 characters
    before the salutation, falling back to "Letter <n>" where n is the
    1-based position of the salutation. A letter runs to the next
    salutation, or earlier if an editorial "_Letter <numeral>._" line
    appears first, and is then cut at the first paragraph that starts with
    an underscore followed by a capital letter.

    Args:
        text: Raw Project Gutenberg text of the collection.

    Returns:
        One dict per letter (bodies of 80 characters or fewer are skipped)
        with the keys ``heading``, ``body``, ``author``, ``recipient``,
        ``source`` and ``period``.
    """
    text = normalize(strip_gutenberg(text))
    letters = []
    starts = [m.start() for m in re.finditer(r"^SIR,--", text, re.MULTILINE)]
    for index, start in enumerate(starts):
        # Look for a date line just before the salutation.
        lead_in = text[max(0, start - 200):start]
        dated = re.search(r"\n\n_([^_]+)_\.?\s*\n\s*$", lead_in)
        heading = dated.group(1).strip() if dated else ""

        if index + 1 < len(starts):
            stop = starts[index + 1]
            # Editorial commentary for the next letter ends this one early.
            commentary = re.search(r"\n_Letter\s+[IVXLC]+\._", text[start:stop])
            if commentary:
                stop = start + commentary.start()
        else:
            stop = len(text)

        body = text[start:stop].strip()
        # Trim trailing editorial paragraphs.
        body = re.split(r"\n\n(?=_[A-Z])", body)[0].strip()
        if len(body) > 80:
            letters.append({
                "heading": heading or f"Letter {index + 1}",
                "body": body,
                "author": "Dorothy Osborne",
                "recipient": "Sir William Temple",
                "source": "The Love Letters of Dorothy Osborne to Sir William Temple",
                # Year range with its en dash restored (was "16521654").
                "period": "1652–1654",
            })
    return letters
def extract_beethoven(text: str) -> list[dict]:
    """Extract the romantic letters from Beethoven's collected letters.

    Letters are headed by a number line ("<digits>."), a blank line and a
    line starting with "TO ". A letter is kept only when its TO line or
    body contains one of the romantic keywords below (compared in lower
    case).

    Args:
        text: Raw Project Gutenberg text of the volume.

    Returns:
        One dict per selected letter with the keys ``heading``, ``body``,
        ``author``, ``recipient``, ``source`` and ``period``. The recipient
        is the TO line without its leading "TO" and without leading or
        trailing full stops.
    """
    text = normalize(strip_gutenberg(text))
    chunks = re.split(r"\n{2,}(?=\d+\.\s*\n\s*\nTO\s)", text)
    # Keywords used to identify love/romantic letters.
    love_keywords = (
        "immortal beloved", "my angel", "my love", "beloved",
        "my heart", "kiss", "embrace", "love you",
        "sweetheart", "giulietta", "guicciardi",
        "josephine brunsvik", "bettina", "brentano",
        "amalie sebald", "my all", "my second self",
        "ardently", "passionately", "tenderly yours",
    )
    letters = []
    for chunk in chunks:
        chunk = chunk.strip()
        header = re.match(r"(\d+)\.\s*\n\s*\n(TO\s+.+?)(?:\n|$)", chunk)
        if header is None or len(chunk) < 100:
            continue
        num = header.group(1)
        to_line = header.group(2).strip()
        body = chunk[header.end():].strip()
        searchable = (to_line + " " + body).lower()
        if not any(keyword in searchable for keyword in love_keywords):
            continue
        letters.append({
            # Separator restored; the heading used to read "No. 5TO ...".
            "heading": f"No. {num} — {to_line}",
            "body": body,
            "author": "Ludwig van Beethoven",
            # Remove only the leading "TO"; a blanket replace("TO ", "")
            # would also eat "TO " occurring inside the addressee's name.
            "recipient": re.sub(r"^TO\s+", "", to_line).strip("."),
            # Year range with its en dash restored (was "17901826").
            "source": "Beethoven's Letters 1790–1826",
            "period": "1790–1826",
        })
    return letters
def extract_mozart(text: str) -> list[dict]:
    """Extract the romantic letters from Mozart's collected letters.

    Sections are headed by a number line ("<digits>."). If the section body
    starts with a "TO ..." line, that line supplies the recipient;
    otherwise the recipient defaults to "Constanze Mozart". A section is
    kept only when its body contains one of the romantic keywords below
    (compared in lower case).

    Args:
        text: Raw Project Gutenberg text of the volume.

    Returns:
        One dict per selected letter with the keys ``heading``, ``body``,
        ``author``, ``recipient``, ``source`` and ``period``.
    """
    text = normalize(strip_gutenberg(text))
    chunks = re.split(r"\n{2,}(?=\d+\.\s*\n)", text)
    love_keywords = (
        "my love", "kiss", "beloved", "my heart",
        "my dear wife", "constanze", "my darling",
        "embrace you", "tender", "passionately",
        "aloysia", "my dearest wife",
    )
    letters = []
    for chunk in chunks:
        chunk = chunk.strip()
        header = re.match(r"(\d+)\.\s*\n", chunk)
        if header is None or len(chunk) < 100:
            continue
        num = header.group(1)
        body = chunk[header.end():].strip()
        # Pick up the addressee line when the body starts with one.
        to_match = re.match(r"(TO\s+.+?)(?:\n|$)", body)
        to_line = to_match.group(1).strip() if to_match else ""
        if not any(keyword in body.lower() for keyword in love_keywords):
            continue
        if to_line:
            # Remove only the leading "TO"; a blanket replace("TO ", "")
            # would also eat "TO " occurring inside the addressee's name.
            recipient = re.sub(r"^TO\s+", "", to_line).strip(".")
            # Separator restored; the heading used to read "No. 5TO ...".
            heading = f"No. {num} — {to_line}"
        else:
            recipient = "Constanze Mozart"
            heading = f"No. {num}"
        letters.append({
            "heading": heading,
            "body": body,
            "author": "Wolfgang Amadeus Mozart",
            "recipient": recipient,
            "source": "The Letters of Wolfgang Amadeus Mozart",
            # Year range with its en dash restored (was "17691791").
            "period": "1769–1791",
        })
    return letters
# Dispatch table: source "id" -> function that parses that source's raw text
# into a list of letter dicts.
EXTRACTORS = dict(
    henry_viii=extract_henry_viii,
    wollstonecraft=extract_wollstonecraft,
    abelard_heloise=extract_abelard_heloise,
    napoleon=extract_napoleon,
    keats_brawne=extract_keats_brawne,
    browning=extract_browning,
    browning_vol2=extract_browning_vol2,
    burns_clarinda=extract_burns_clarinda,
    dorothy_osborne=extract_dorothy_osborne,
    beethoven=extract_beethoven,
    mozart=extract_mozart,
)
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def download_source(source: dict, force: bool = False) -> int:
    """Download, parse, and save the letters for one source.

    If the output JSON file already exists and *force* is false, nothing is
    downloaded and the number of letters in the existing file is returned.
    An existing file that cannot be read or parsed is reported and the
    source is downloaded again.

    Args:
        source: An entry with at least the keys ``id``, ``title`` and
            ``url``.
        force: Re-download even when the output file already exists.

    Returns:
        The number of letters available for the source, or 0 when the
        download fails, no extractor is registered for the source id, or
        no letters could be extracted.
    """
    out_path = os.path.join(LETTERS_DIR, f"{source['id']}.json")
    if os.path.exists(out_path) and not force:
        try:
            # Context manager so the handle is closed deterministically.
            with open(out_path, "r", encoding="utf-8") as cached:
                return len(json.load(cached))
        except (OSError, ValueError) as e:
            # ValueError covers JSON and UTF-8 decoding errors, e.g. a file
            # left half-written by an interrupted run.
            print(f" ⚠ Cached file unreadable, re-downloading: {e}")

    print(f" ⬇ Downloading: {source['title']}", flush=True)
    try:
        raw = download_text(source["url"])
    except Exception as e:
        # Best effort: one failing source must not abort the whole run.
        print(f" ⚠ Failed: {e}")
        return 0

    extractor = EXTRACTORS.get(source["id"])
    if extractor is None:
        print(f" ⚠ No extractor for {source['id']}")
        return 0

    letters = extractor(raw)
    if not letters:
        print(f" ⚠ No letters extracted from {source['title']}")
        return 0

    os.makedirs(LETTERS_DIR, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(letters, f, ensure_ascii=False, indent=2)
    print(f"{len(letters)} letters saved → letters/{source['id']}.json")
    return len(letters)
def main() -> None:
    """Command-line entry point.

    With ``--list``, print the available sources and exit. Otherwise
    process every entry in ``SOURCES`` (re-downloading existing ones when
    ``--force`` is given) and print the total number of letters.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Download love letters from Project Gutenberg.")
    parser.add_argument("--force", action="store_true", help="re-download all sources")
    parser.add_argument("--list", action="store_true", help="list available sources")
    args = parser.parse_args()

    if args.list:
        print("\n Available sources:\n")
        for i, src in enumerate(SOURCES, 1):
            print(f" {i:2}. [{src['id']}] {src['title']}")
            # Separator restored; author and recipient used to run together.
            print(f" {src['author']} → {src['recipient']} ({src['year']})")
            print(f" gutenberg.org/ebooks/{src['gutenberg_id']}")
            print()
        return

    print("\n 📥 Downloading love letters from Project Gutenberg…\n")
    total = sum(download_source(source, force=args.force) for source in SOURCES)
    print(f"\n 📬 Total: {total} letters in {LETTERS_DIR}/\n")


if __name__ == "__main__":
    main()