|
|
|
|
@ -0,0 +1,549 @@
|
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
"""
|
|
|
|
|
Download and parse love letters from Project Gutenberg.
|
|
|
|
|
|
|
|
|
|
This script fetches letter collections from Gutenberg, extracts individual
|
|
|
|
|
letters, and saves them as JSON files in the letters/ directory. Run this
|
|
|
|
|
once (or with --force to re-download) to populate the data that
|
|
|
|
|
love_letters.py reads.
|
|
|
|
|
|
|
|
|
|
Usage:
|
|
|
|
|
python3 download_letters.py # download all sources
|
|
|
|
|
python3 download_letters.py --force # re-download everything
|
|
|
|
|
python3 download_letters.py --list # show available sources
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
import json
|
|
|
|
|
import os
|
|
|
|
|
import re
|
|
|
|
|
import sys
|
|
|
|
|
import urllib.request
|
|
|
|
|
|
|
|
|
|
# Absolute directory that contains this script.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# Directory (next to the script) that receives one JSON file per source.
LETTERS_DIR = os.path.join(SCRIPT_DIR, "letters")


def _gutenberg_source(
    source_id: str,
    title: str,
    author: str,
    recipient: str,
    year: str,
    gutenberg_id: int,
) -> dict:
    """Build one SOURCES entry; the plain-text URL is derived from the ebook id."""
    return {
        "id": source_id,
        "title": title,
        "author": author,
        "recipient": recipient,
        "year": year,
        "url": f"https://www.gutenberg.org/cache/epub/{gutenberg_id}/pg{gutenberg_id}.txt",
        "gutenberg_id": gutenberg_id,
    }


# Catalogue of letter collections. Each "id" names the output JSON file and
# selects the matching extractor function.
SOURCES = [
    _gutenberg_source(
        "henry_viii",
        "The Love Letters of Henry VIII to Anne Boleyn",
        "Henry VIII",
        "Anne Boleyn",
        "c. 1527–1528",
        32155,
    ),
    _gutenberg_source(
        "wollstonecraft",
        "The Love Letters of Mary Wollstonecraft to Gilbert Imlay",
        "Mary Wollstonecraft",
        "Gilbert Imlay",
        "1793–1795",
        34413,
    ),
    _gutenberg_source(
        "abelard_heloise",
        "Letters of Abelard and Heloise",
        "Abelard & Heloise",
        "each other",
        "12th century",
        35977,
    ),
    _gutenberg_source(
        "napoleon",
        "Napoleon's Letters to Josephine",
        "Napoleon Bonaparte",
        "Josephine",
        "1796–1812",
        37499,
    ),
    _gutenberg_source(
        "keats_brawne",
        "Letters of John Keats to Fanny Brawne",
        "John Keats",
        "Fanny Brawne",
        "1819–1820",
        60433,
    ),
    _gutenberg_source(
        "browning",
        "The Letters of Robert Browning and Elizabeth Barrett Barrett, Vol. 1",
        "Robert Browning & Elizabeth Barrett Browning",
        "each other",
        "1845–1846",
        16182,
    ),
    _gutenberg_source(
        "browning_vol2",
        "The Letters of Robert Browning and Elizabeth Barrett Barrett, Vol. 2",
        "Robert Browning & Elizabeth Barrett Browning",
        "each other",
        "1845–1846",
        73891,
    ),
    _gutenberg_source(
        "burns_clarinda",
        "Letters of Robert Burns to Clarinda",
        "Robert Burns",
        "Clarinda (Agnes McLehose)",
        "1787–1794",
        9863,
    ),
    _gutenberg_source(
        "dorothy_osborne",
        "The Love Letters of Dorothy Osborne to Sir William Temple",
        "Dorothy Osborne",
        "Sir William Temple",
        "1652–1654",
        12544,
    ),
    _gutenberg_source(
        "beethoven",
        "Beethoven's Letters 1790-1826, Volume 1",
        "Ludwig van Beethoven",
        "various (love letters selected)",
        "1790–1826",
        13065,
    ),
    _gutenberg_source(
        "mozart",
        "The Letters of Wolfgang Amadeus Mozart, Volume 1",
        "Wolfgang Amadeus Mozart",
        "various (love letters selected)",
        "1769–1791",
        5307,
    ),
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
# Helpers
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
def download_text(url: str) -> str:
    """Download a plain-text file from Project Gutenberg.

    Args:
        url: Address of the text file to fetch.

    Returns:
        The response body decoded as UTF-8. A leading byte-order mark is
        dropped and undecodable bytes become U+FFFD instead of raising.

    Raises:
        Whatever ``urllib.request.urlopen`` raises for network/HTTP failures;
        the request uses a 30-second timeout.
    """
    req = urllib.request.Request(url, headers={"User-Agent": "LoveLettersApp/1.0"})
    with urllib.request.urlopen(req, timeout=30) as resp:
        raw = resp.read()
    # "utf-8-sig" is plain UTF-8 that also swallows a leading BOM, so the
    # returned text never starts with a stray U+FEFF character.
    return raw.decode("utf-8-sig", errors="replace")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def strip_gutenberg(text: str) -> str:
    """Remove Project Gutenberg header and footer boilerplate.

    Everything up to and including the line holding a known "START OF ..."
    marker is dropped, and everything from the earliest known end-of-book
    marker onward is dropped. Text without markers is returned unchanged
    apart from surrounding whitespace being stripped.

    Args:
        text: Full text of a Gutenberg ebook.

    Returns:
        The text between header and footer, stripped of outer whitespace.
    """
    start_markers = (
        "*** START OF THE PROJECT GUTENBERG EBOOK",
        "*** START OF THIS PROJECT GUTENBERG EBOOK",
        "***START OF THE PROJECT GUTENBERG EBOOK",
    )
    end_markers = (
        "*** END OF THE PROJECT GUTENBERG EBOOK",
        "*** END OF THIS PROJECT GUTENBERG EBOOK",
        "***END OF THE PROJECT GUTENBERG EBOOK",
        "End of the Project Gutenberg EBook",
        "End of Project Gutenberg",
    )

    for marker in start_markers:
        idx = text.find(marker)
        if idx != -1:
            # Skip the rest of the marker line (it carries the book title).
            nl = text.find("\n", idx)
            text = text[nl + 1:] if nl != -1 else text[idx + len(marker):]
            break

    # Several footer markers can coexist (an "End of the Project Gutenberg
    # EBook ..." line often precedes the "*** END ..." line). Cut at the one
    # that appears first in the text, not the first one in the list, so that
    # no footer line leaks into the result.
    footer_hits = [pos for pos in map(text.find, end_markers) if pos != -1]
    if footer_hits:
        text = text[:min(footer_hits)]

    return text.strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def normalize(text: str) -> str:
    """Normalize line endings so every line break is a single LF.

    CRLF pairs are converted first, then any remaining lone CR, so a CRLF
    never turns into two line breaks.
    """
    return text.replace("\r\n", "\n").replace("\r", "\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
# Per-source extractors
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
def extract_henry_viii(text: str) -> list[dict]:
    """Extract the individual letters from the Henry VIII / Anne Boleyn text.

    The cleaned text is split before every ``Letter <Ordinal>`` heading
    (First through Eighteenth). For each chunk the heading line is removed,
    together with an addressee line sitting directly beneath it, and anything
    from a ``Notes`` / ``NOTES`` line onward is cut off.

    Args:
        text: Raw ebook text as downloaded.

    Returns:
        One dict per letter with the keys ``heading``, ``body``, ``author``,
        ``recipient``, ``source`` and ``period``. Chunks shorter than 80
        characters or with a body of 50 characters or fewer are skipped.
    """
    text = normalize(strip_gutenberg(text))
    parts = re.split(
        r"\n{2,}(?=Letter\s+(?:First|Second|Third|Fourth|Fifth|Sixth|Seventh|"
        r"Eighth|Ninth|Tenth|Eleventh|Twelfth|Thirteenth|Fourteenth|"
        r"Fifteenth|Sixteenth|Seventeenth|Eighteenth)\b)",
        text,
    )
    letters = []
    for part in parts:
        part = part.strip()
        # Match the heading line only. The previous pattern let ``\s+`` run
        # across a blank line, which swallowed the first line of the letter
        # whenever the heading was followed by an empty line.
        m = re.match(r"(Letter\s+\w+)((?:[ \t][^\n]*)?)\n", part, re.IGNORECASE)
        if not m or len(part) < 80:
            continue
        heading = m.group(1)
        rest = part[m.end():]
        if not m.group(2).strip():
            # Nothing followed the ordinal on the heading line: a non-blank
            # line directly below it is treated as the addressee line and
            # dropped, as before. A blank line means the body starts next.
            subtitle, newline, remainder = rest.partition("\n")
            if subtitle.strip() and newline:
                rest = remainder
        body = rest.strip()
        for tag in ["\nNotes\n", "\nNOTES\n"]:
            idx = body.find(tag)
            if idx != -1:
                body = body[:idx].strip()
        author, recipient = "Henry VIII", "Anne Boleyn"
        # A "... Boleyn to ..." heading marks the letter written by Anne.
        if "Boleyn to" in part[:200]:
            author, recipient = "Anne Boleyn", "Cardinal Wolsey"
        if len(body) > 50:
            letters.append({
                "heading": heading, "body": body,
                "author": author, "recipient": recipient,
                "source": "The Love Letters of Henry VIII to Anne Boleyn",
                "period": "c. 1527–1528",
            })
    return letters
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_wollstonecraft(text: str) -> list[dict]:
    """Extract Mary Wollstonecraft's letters to Gilbert Imlay.

    The cleaned text is cut before each ``LETTER <numeral>`` heading line
    (matched case-insensitively). Chunks under 80 characters, or whose body
    is 50 characters or fewer, are ignored.

    Returns:
        One dict per letter with ``heading``, ``body``, ``author``,
        ``recipient``, ``source`` and ``period`` keys.
    """
    heading_re = re.compile(r"(LETTER\s+[IVXLC0-9]+\.?)\s*\n", re.IGNORECASE)
    cleaned = normalize(strip_gutenberg(text))
    chunks = re.split(
        r"\n{2,}(?=LETTER\s+[IVXLC0-9]+\.?\s*\n)", cleaned, flags=re.IGNORECASE
    )

    found = []
    for chunk in map(str.strip, chunks):
        head = heading_re.match(chunk)
        if head is None or len(chunk) < 80:
            continue
        content = chunk[head.end():].strip()
        if len(content) <= 50:
            continue
        found.append({
            "heading": head.group(1),
            "body": content,
            "author": "Mary Wollstonecraft",
            "recipient": "Gilbert Imlay",
            "source": "The Love Letters of Mary Wollstonecraft to Gilbert Imlay",
            "period": "1793–1795",
        })
    return found
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_abelard_heloise(text: str) -> list[dict]:
    """Extract the letters exchanged between Abelard and Heloise.

    The cleaned text is cut before each ``LETTER <numeral>`` heading line
    (case-insensitive, optional trailing ``.`` or ``:``). The writer is
    inferred from the first 300 characters of the body: "heloise to abelard"
    or "abelard to heloise"; otherwise the letter is attributed to both.

    Returns:
        One dict per letter with ``heading``, ``body``, ``author``,
        ``recipient``, ``source`` and ``period`` keys. Chunks under 120
        characters or bodies of 50 characters or fewer are skipped.
    """
    cleaned = normalize(strip_gutenberg(text))
    chunks = re.split(
        r"\n{2,}(?=LETTER\s+[IVXLC0-9]+[.:]?\s*\n)", cleaned, flags=re.IGNORECASE
    )

    found = []
    for chunk in map(str.strip, chunks):
        head = re.match(r"(LETTER\s+[IVXLC0-9]+[.:]?)\s*\n", chunk, re.IGNORECASE)
        if head is None or len(chunk) < 120:
            continue
        content = chunk[head.end():].strip()

        opening = content[:300].lower()
        if "heloise to abelard" in opening:
            writer, addressee = "Heloise", "Abelard"
        elif "abelard to heloise" in opening:
            writer, addressee = "Abelard", "Heloise"
        else:
            writer, addressee = "Abelard & Heloise", "each other"

        if len(content) <= 50:
            continue
        found.append({
            "heading": head.group(1),
            "body": content,
            "author": writer,
            "recipient": addressee,
            "source": "Letters of Abelard and Heloise",
            "period": "12th century",
        })
    return found
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_napoleon(text: str) -> list[dict]:
    """Extract Napoleon's letters to Josephine.

    Letters are introduced by a ``No. <digits>.`` line; the cleaned text is
    cut before each such line. Chunks under 100 characters, or whose body is
    80 characters or fewer, are dropped.

    Returns:
        One dict per letter with ``heading``, ``body``, ``author``,
        ``recipient``, ``source`` and ``period`` keys.
    """
    cleaned = normalize(strip_gutenberg(text))
    number_line = re.compile(r"(No\.\s*\d+\.)\s*\n")

    found = []
    for chunk in re.split(r"\n{2,}(?=No\.\s*\d+\.\s*\n)", cleaned):
        chunk = chunk.strip()
        head = number_line.match(chunk)
        if head is None or len(chunk) < 100:
            continue
        content = chunk[head.end():].strip()
        if len(content) <= 80:
            continue
        found.append({
            "heading": head.group(1),
            "body": content,
            "author": "Napoleon Bonaparte",
            "recipient": "Josephine",
            "source": "Napoleon's Letters to Josephine, 1796–1812",
            "period": "1796–1812",
        })
    return found
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_keats_brawne(text: str) -> list[dict]:
    """Extract John Keats's letters to Fanny Brawne.

    Each letter starts with a line holding only a Roman numeral and a period
    (e.g. ``XII.``); the cleaned text is cut before each such line. Chunks
    under 100 characters, or bodies of 50 characters or fewer, are dropped.

    Returns:
        One dict per letter with ``heading`` (``"Letter <numeral>"``),
        ``body``, ``author``, ``recipient``, ``source`` and ``period`` keys.
    """
    cleaned = normalize(strip_gutenberg(text))
    numeral_line = re.compile(r"([IVXLC]+)\.\s*\n")

    found = []
    for chunk in re.split(r"\n{2,}(?=[IVXLC]+\.\s*\n)", cleaned):
        chunk = chunk.strip()
        head = numeral_line.match(chunk)
        if head is None or len(chunk) < 100:
            continue
        content = chunk[head.end():].strip()
        if len(content) <= 50:
            continue
        found.append({
            "heading": f"Letter {head.group(1)}",
            "body": content,
            "author": "John Keats",
            "recipient": "Fanny Brawne",
            "source": "Letters of John Keats to Fanny Brawne",
            "period": "1819–1820",
        })
    return found
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_browning(text: str, vol_label: str) -> list[dict]:
    """Extract letters from one volume of the Browning correspondence.

    Letters open with an italic direction line, ``_R.B. to E.B.B._`` or
    ``_E.B.B. to R.B._``, which decides author and recipient.

    Args:
        text: Raw ebook text as downloaded.
        vol_label: Volume label appended to the ``source`` title.

    Returns:
        One dict per letter with ``heading``, ``body``, ``author``,
        ``recipient``, ``source`` and ``period`` keys. Chunks under 100
        characters or bodies of 50 characters or fewer are skipped.
    """
    # Direction line text -> (author, recipient).
    correspondents = {
        "R.B. to E.B.B.": ("Robert Browning", "Elizabeth Barrett Browning"),
        "E.B.B. to R.B.": ("Elizabeth Barrett Browning", "Robert Browning"),
    }
    cleaned = normalize(strip_gutenberg(text))
    chunks = re.split(
        r"\n{2,}(?=_(?:R\.B\. to E\.B\.B\.|E\.B\.B\. to R\.B\.)_)", cleaned
    )

    found = []
    for chunk in map(str.strip, chunks):
        head = re.match(r"_(R\.B\. to E\.B\.B\.|E\.B\.B\. to R\.B\.)_\s*\n", chunk)
        if head is None or len(chunk) < 100:
            continue
        direction = head.group(1)
        content = chunk[head.end():].strip()
        writer, addressee = correspondents[direction]
        if len(content) <= 50:
            continue
        found.append({
            "heading": direction,
            "body": content,
            "author": writer,
            "recipient": addressee,
            "source": f"The Letters of Robert Browning and Elizabeth Barrett Barrett, {vol_label}",
            "period": "1845–1846",
        })
    return found
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_browning(text: str) -> list[dict]:
    """Extract the Browning letters, labelling the source as "Vol. 1"."""
    volume_one = _extract_browning(text, vol_label="Vol. 1")
    return volume_one
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_browning_vol2(text: str) -> list[dict]:
    """Extract the Browning letters, labelling the source as "Vol. 2"."""
    volume_two = _extract_browning(text, vol_label="Vol. 2")
    return volume_two
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_burns_clarinda(text: str) -> list[dict]:
    """Extract Robert Burns's letters to Clarinda.

    Only the part of the text starting at the first "LETTERS TO CLARINDA" is
    considered; it ends at the next all-caps heading line preceded by blank
    lines (searched from 100 characters into the section). Inside it, each
    letter starts with a line holding a Roman numeral and a period.

    Returns:
        One dict per letter with ``heading``, ``body``, ``author``,
        ``recipient``, ``source`` and ``period`` keys, or an empty list when
        the section heading is absent. Chunks under 80 characters or bodies
        of 50 characters or fewer are skipped.
    """
    cleaned = normalize(strip_gutenberg(text))

    _, title, tail = cleaned.partition("LETTERS TO CLARINDA")
    if not title:
        return []
    section = title + tail

    # Skip the first 100 characters so the section's own heading cannot be
    # mistaken for the heading that ends it.
    skip = 100
    next_heading = re.search(r"\n{3,}[A-Z][A-Z ]{10,}\n", section[skip:])
    if next_heading is not None:
        section = section[:skip + next_heading.start()]

    found = []
    for chunk in re.split(r"\n{2,}(?=[IVXLC]+\.\s*\n)", section):
        chunk = chunk.strip()
        head = re.match(r"([IVXLC]+)\.\s*\n", chunk)
        if head is None or len(chunk) < 80:
            continue
        content = chunk[head.end():].strip()
        if len(content) <= 50:
            continue
        found.append({
            "heading": f"Letter {head.group(1)}",
            "body": content,
            "author": "Robert Burns",
            "recipient": "Clarinda (Agnes McLehose)",
            "source": "Letters of Robert Burns to Clarinda",
            "period": "1787–1794",
        })
    return found
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_dorothy_osborne(text: str) -> list[dict]:
    """Extract Dorothy Osborne's letters to Sir William Temple.

    Every line starting with the salutation ``SIR,--`` begins a letter. An
    italic ``_..._`` line found just before the salutation (within the
    preceding 200 characters) becomes the heading; otherwise the heading is
    ``"Letter <n>"``, counting salutations from 1.

    Returns:
        One dict per letter with ``heading``, ``body``, ``author``,
        ``recipient``, ``source`` and ``period`` keys. Bodies of 80
        characters or fewer are skipped.
    """
    cleaned = normalize(strip_gutenberg(text))

    starts = [hit.start() for hit in re.finditer(r"^SIR,--", cleaned, re.MULTILINE)]
    # Pair each salutation with the following one; the last has no successor.
    following = starts[1:] + [None]

    found = []
    for number, (begin, next_begin) in enumerate(zip(starts, following), start=1):
        lookback = cleaned[max(0, begin - 200):begin]
        dated = re.search(r"\n\n_([^_]+)_\.?\s*\n\s*$", lookback)

        if next_begin is None:
            # Last letter: runs to the end of the text.
            stop = len(cleaned)
        else:
            stop = next_begin
            # An "_Letter <numeral>._" line before the next salutation marks
            # where the letter text stops.
            note = re.search(r"\n_Letter\s+[IVXLC]+\._", cleaned[begin:stop])
            if note is not None:
                stop = begin + note.start()

        content = cleaned[begin:stop].strip()
        # Keep only what precedes the first paragraph opening with "_" plus a
        # capital letter.
        content = re.split(r"\n\n(?=_[A-Z])", content)[0].strip()
        if len(content) <= 80:
            continue

        heading = dated.group(1).strip() if dated else ""
        found.append({
            "heading": heading or f"Letter {number}",
            "body": content,
            "author": "Dorothy Osborne",
            "recipient": "Sir William Temple",
            "source": "The Love Letters of Dorothy Osborne to Sir William Temple",
            "period": "1652–1654",
        })
    return found
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_beethoven(text: str) -> list[dict]:
    """Extract the romantic letters from Beethoven's collected letters.

    Letters are headed by a number line (``N.``), a blank line and a
    ``TO ...`` line. A letter is kept only when its ``TO`` line or body
    contains one of the love keywords (case-insensitive substring match).

    Args:
        text: Raw ebook text as downloaded.

    Returns:
        One dict per kept letter with ``heading``, ``body``, ``author``,
        ``recipient``, ``source`` and ``period`` keys. ``recipient`` is the
        ``TO`` line without its leading "TO" and without surrounding periods.
        Chunks under 100 characters are skipped.
    """
    text = normalize(strip_gutenberg(text))
    # Letters are headed "N.\n\nTO ..." where N is a number
    parts = re.split(r"\n{2,}(?=\d+\.\s*\n\s*\nTO\s)", text)
    letters = []
    # Keywords to identify love/romantic letters
    love_keywords = [
        "immortal beloved", "my angel", "my love", "beloved",
        "my heart", "kiss", "embrace", "love you",
        "sweetheart", "giulietta", "guicciardi",
        "josephine brunsvik", "bettina", "brentano",
        "amalie sebald", "my all", "my second self",
        "ardently", "passionately", "tenderly yours",
    ]
    for part in parts:
        part = part.strip()
        m = re.match(r"(\d+)\.\s*\n\s*\n(TO\s+.+?)(?:\n|$)", part)
        if not m or len(part) < 100:
            continue
        num = m.group(1)
        to_line = m.group(2).strip()
        body = part[m.end():].strip()
        full_text = (to_line + " " + body).lower()
        # Only include letters with romantic content
        if not any(kw in full_text for kw in love_keywords):
            continue
        # Remove only the leading "TO". str.replace("TO ", "") also deleted
        # every later "TO " in the line, corrupting names such as "OTTO ".
        recipient = re.sub(r"^TO\s+", "", to_line).strip(".")
        letters.append({
            "heading": f"No. {num} — {to_line}",
            "body": body,
            "author": "Ludwig van Beethoven",
            "recipient": recipient,
            "source": "Beethoven's Letters 1790–1826",
            "period": "1790–1826",
        })
    return letters
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_mozart(text: str) -> list[dict]:
    """Extract the romantic letters from Mozart's collected letters.

    Letters are introduced by a number line (``N.``). If the body starts
    with a ``TO ...`` line, that line supplies the recipient; otherwise the
    recipient defaults to "Constanze Mozart". A letter is kept only when its
    body contains one of the love keywords (case-insensitive substring
    match).

    Args:
        text: Raw ebook text as downloaded.

    Returns:
        One dict per kept letter with ``heading``, ``body``, ``author``,
        ``recipient``, ``source`` and ``period`` keys. Chunks under 100
        characters are skipped.
    """
    text = normalize(strip_gutenberg(text))
    # Mozart's letters use numbered sections
    parts = re.split(r"\n{2,}(?=\d+\.\s*\n)", text)
    letters = []
    love_keywords = [
        "my love", "kiss", "beloved", "my heart",
        "my dear wife", "constanze", "my darling",
        "embrace you", "tender", "passionately",
        "aloysia", "my dearest wife",
    ]
    for part in parts:
        part = part.strip()
        m = re.match(r"(\d+)\.\s*\n", part)
        if not m or len(part) < 100:
            continue
        num = m.group(1)
        body = part[m.end():].strip()
        # Extract TO line if present
        to_match = re.match(r"(TO\s+.+?)(?:\n|$)", body)
        to_line = to_match.group(1).strip() if to_match else ""
        full_text = body.lower()
        if not any(kw in full_text for kw in love_keywords):
            continue
        if to_line:
            # Remove only the leading "TO". str.replace("TO ", "") also
            # deleted every later "TO " in the line, corrupting names.
            recipient = re.sub(r"^TO\s+", "", to_line).strip(".")
        else:
            recipient = "Constanze Mozart"
        letters.append({
            "heading": f"No. {num}" + (f" — {to_line}" if to_line else ""),
            "body": body,
            "author": "Wolfgang Amadeus Mozart",
            "recipient": recipient,
            "source": "The Letters of Wolfgang Amadeus Mozart",
            "period": "1769–1791",
        })
    return letters
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Maps a source "id" to the function that parses that source's raw text.
EXTRACTORS = dict(
    henry_viii=extract_henry_viii,
    wollstonecraft=extract_wollstonecraft,
    abelard_heloise=extract_abelard_heloise,
    napoleon=extract_napoleon,
    keats_brawne=extract_keats_brawne,
    browning=extract_browning,
    browning_vol2=extract_browning_vol2,
    burns_clarinda=extract_burns_clarinda,
    dorothy_osborne=extract_dorothy_osborne,
    beethoven=extract_beethoven,
    mozart=extract_mozart,
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
# Main
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
def download_source(source: dict, force: bool = False) -> int:
    """Download, parse, and save letters for one source. Returns letter count.

    Args:
        source: Entry describing the collection; the keys ``id``, ``title``
            and ``url`` are read.
        force: When true, re-download even if the JSON file already exists.

    Returns:
        Number of letters in the existing or newly written JSON file, or 0
        when the download fails, no extractor is registered for the source,
        or no letters could be extracted.
    """
    out_path = os.path.join(LETTERS_DIR, f"{source['id']}.json")
    if os.path.exists(out_path) and not force:
        try:
            # Context manager so the handle is closed deterministically.
            with open(out_path, "r", encoding="utf-8") as cached:
                return len(json.load(cached))
        except (OSError, ValueError) as e:
            # Unreadable or corrupt cache (JSONDecodeError is a ValueError):
            # fall through and rebuild it instead of aborting the whole run.
            print(f" ⚠ Ignoring unreadable {out_path}: {e}")

    print(f" ⬇ Downloading: {source['title']}…", flush=True)
    try:
        raw = download_text(source["url"])
    except Exception as e:
        # Best effort: one failing source must not stop the others.
        print(f" ⚠ Failed: {e}")
        return 0

    extractor = EXTRACTORS.get(source["id"])
    if extractor is None:
        print(f" ⚠ No extractor for {source['id']}")
        return 0

    letters = extractor(raw)
    if not letters:
        print(f" ⚠ No letters extracted from {source['title']}")
        return 0

    os.makedirs(LETTERS_DIR, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(letters, f, ensure_ascii=False, indent=2)

    print(f" ✓ {len(letters)} letters saved → letters/{source['id']}.json")
    return len(letters)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point.

    ``--list`` prints the available sources and exits; otherwise every
    source is processed (``--force`` re-downloads existing ones) and the
    total number of letters is printed.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Download love letters from Project Gutenberg.")
    cli.add_argument("--force", action="store_true", help="re-download all sources")
    cli.add_argument("--list", action="store_true", help="list available sources")
    options = cli.parse_args()

    if not options.list:
        print("\n 📥 Downloading love letters from Project Gutenberg…\n")
        total = sum(download_source(entry, force=options.force) for entry in SOURCES)
        print(f"\n 📬 Total: {total} letters in {LETTERS_DIR}/\n")
        return

    print("\n Available sources:\n")
    for position, entry in enumerate(SOURCES, start=1):
        print(f" {position:2}. [{entry['id']}] {entry['title']}")
        print(f" {entry['author']} → {entry['recipient']} ({entry['year']})")
        print(f" gutenberg.org/ebooks/{entry['gutenberg_id']}")
        print()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|