|
|
#!/usr/bin/env python3
|
|
|
"""Download and parse poetry collections from Project Gutenberg.
|
|
|
|
|
|
This is a maintainer-only tool. End users should use the pre-downloaded
|
|
|
poetry files in the poetry/ directory.
|
|
|
|
|
|
Usage:
|
|
|
python download_poetry.py # Download all sources
|
|
|
python download_poetry.py --list # List available sources
|
|
|
"""
|
|
|
|
|
|
import json
|
|
|
import os
|
|
|
import re
|
|
|
import sys
|
|
|
import urllib.request
|
|
|
|
|
|
# Absolute path of the "poetry" directory that sits next to this script.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
POETRY_DIR = os.path.join(_SCRIPT_DIR, "poetry")

# URL template for a book's plain-text file; ``{id}`` is filled in with the
# Gutenberg id by ``str.format``.
GUTENBERG_URL = "https://www.gutenberg.org/cache/epub/{id}/pg{id}.txt"
|
|
|
|
|
|
|
|
|
def fetch_text(gutenberg_id):
    """Download a Project Gutenberg text and return it with LF line endings.

    The URL is built from ``GUTENBERG_URL``. The response is decoded as
    UTF-8 (a leading byte-order mark is dropped by the ``utf-8-sig`` codec)
    and both CRLF and lone CR line endings are converted to ``\\n``.
    Network and decoding errors propagate to the caller.
    """
    url = GUTENBERG_URL.format(id=gutenberg_id)
    print(f" Downloading {url} ...")
    request = urllib.request.Request(
        url,
        headers={"User-Agent": "PoetryDownloader/1.0"},
    )
    with urllib.request.urlopen(request, timeout=30) as response:
        payload = response.read()
    decoded = payload.decode("utf-8-sig")
    # CRLF must be handled before lone CR, otherwise every CRLF would turn
    # into two newlines.
    return decoded.replace("\r\n", "\n").replace("\r", "\n")
|
|
|
|
|
|
|
|
|
def extract_body(text):
    """Return the text between the Gutenberg START and END marker lines.

    * If no ``*** START OF ... ***`` marker is found, ``text`` is returned
      unchanged.
    * If a START marker is found but no END marker follows it, everything
      after the START marker is returned.
    * Otherwise the slice strictly between the two markers is returned.

    The END marker is only searched for *after* the START marker, so a
    stray "*** END OF" earlier in the file cannot produce an empty body.
    """
    start = re.search(r"\*\*\*\s*START OF.*?\*\*\*", text)
    if not start:
        return text

    # Pattern.search(string, pos) begins scanning at ``pos``.
    end = re.compile(r"\*\*\*\s*END OF.*?\*\*\*").search(text, start.end())
    if end:
        return text[start.end():end.start()]
    return text[start.end():]
|
|
|
|
|
|
|
|
|
def clean_poem(text):
    """Tidy the whitespace of a poem body.

    Trailing whitespace is removed from every line, and blank lines at the
    very beginning and very end are dropped. Leading indentation and blank
    lines inside the poem are preserved.
    """
    rows = [row.rstrip() for row in text.split("\n")]

    # After rstrip() a whitespace-only row is the empty string, so a plain
    # truthiness test is enough to find the first and last rows with content.
    first = 0
    last = len(rows)
    while first < last and not rows[first]:
        first += 1
    while last > first and not rows[last - 1]:
        last -= 1

    return "\n".join(rows[first:last])
|
|
|
|
|
|
|
|
|
def make_poem(title, body, author, source, period):
    """Build the dict describing one poem, or return ``None`` if it is too short.

    ``body`` is passed through ``clean_poem``; if fewer than 20 characters
    remain after stripping, the poem is rejected. The returned dict has the
    keys ``title`` (stripped), ``body`` (cleaned), ``author``, ``source``
    and ``period``.
    """
    cleaned = clean_poem(body)
    # An empty body has length 0, so this single check also covers it.
    if len(cleaned.strip()) < 20:
        return None
    return dict(
        title=title.strip(),
        body=cleaned,
        author=author,
        source=source,
        period=period,
    )
|
|
|
|
|
|
|
|
|
# ─── Extractors ─────────────────────────────────────────────────
|
|
|
|
|
|
def extract_shakespeare_sonnets(text):
    """Shakespeare's Sonnets (Gutenberg 1041).

    The body is split on Roman numerals that stand alone between blank
    lines; each numeral becomes the title ``"Sonnet <numeral>"`` of the
    text that follows it. Returns a list of poem dicts.
    """
    # Splitting on a pattern with one capture group gives
    # [intro, numeral, sonnet, numeral, sonnet, ...].
    pieces = re.split(r"\n\n([IVXLC]+)\n\n", extract_body(text))
    candidates = (
        make_poem(
            f"Sonnet {numeral.strip()}",
            sonnet,
            "William Shakespeare",
            "Shakespeare's Sonnets",
            "1609",
        )
        for numeral, sonnet in zip(pieces[1::2], pieces[2::2])
    )
    return [poem for poem in candidates if poem]
|
|
|
|
|
|
|
|
|
def extract_dickinson(text):
    """Emily Dickinson's Poems, Three Series (Gutenberg 12242).

    Scanning starts at the first line equal to ``"I. LIFE."`` (or at the top
    of the body when that line is absent). Three kinds of lines act as
    boundaries:

    * section headers such as ``"I. LIFE."`` / ``"II. LOVE."``,
    * series dividers: lines containing "SERIES" together with "SECOND" or
      "THIRD",
    * poem titles: short (< 60 chars) upper-case lines that follow a blank
      line and do not start with "[".

    Every boundary saves the poem collected so far. Only a title line starts
    a new poem, and its title is the line converted with ``str.title()``.
    Returns a list of poem dicts.
    """
    lines = extract_body(text).split("\n")
    poems = []
    section_header = re.compile(r"^[IVX]+\.\s+[A-Z]+\.\s*$")

    def _flush(title, body_lines):
        """Store the collected poem if it has both a title and body lines."""
        if title and body_lines:
            poem = make_poem(
                title,
                "\n".join(body_lines),
                "Emily Dickinson",
                "Poems by Emily Dickinson, Three Series",
                "1890–1896",
            )
            if poem:
                poems.append(poem)

    start_idx = next(
        (i for i, line in enumerate(lines) if line.strip() == "I. LIFE."),
        0,
    )

    current_title = ""
    current_body_lines = []

    for i in range(start_idx, len(lines)):
        line = lines[i]
        stripped = line.strip()

        is_section = section_header.match(stripped) is not None
        # Parenthesised on purpose: without the parentheses ``and`` binds
        # tighter than ``or`` and any line containing "THIRD" would count as
        # a divider.
        is_series = "SERIES" in stripped and (
            "SECOND" in stripped or "THIRD" in stripped
        )
        if is_section or is_series:
            _flush(current_title, current_body_lines)
            current_title = ""
            current_body_lines = []
            continue

        is_title = (
            bool(stripped)
            and len(stripped) < 60
            and not stripped.startswith("[")
            and stripped == stripped.upper()
            and re.search(r"[A-Z]{2,}", stripped) is not None
            and i > 0
            and not lines[i - 1].strip()
        )
        if is_title:
            _flush(current_title, current_body_lines)
            current_title = stripped.title()
            current_body_lines = []
            continue

        # Lines seen before the first title (or right after a section/series
        # boundary) belong to no poem and are dropped.
        if current_title:
            current_body_lines.append(line)

    _flush(current_title, current_body_lines)
    return poems
|
|
|
|
|
|
|
|
|
def extract_whitman(text):
    """Walt Whitman's Leaves of Grass (Gutenberg 1322).

    Scanning starts at the first line containing "One's-Self I Sing" (or at
    the top when it is missing). ``BOOK <numeral>`` lines are ignored. A line
    is taken as a poem title when it is non-blank, shorter than 80
    characters, does not start with a space, "[" or "BOOK", follows a blank
    line, and the next non-blank line starts with a space. Returns a list of
    poem dicts.
    """
    lines = extract_body(text).split("\n")
    poems = []

    def _flush(title, body_lines):
        """Store the collected poem if it has both a title and body lines."""
        if title and body_lines:
            poem = make_poem(
                title,
                "\n".join(body_lines),
                "Walt Whitman",
                "Leaves of Grass",
                "1891–1892",
            )
            if poem:
                poems.append(poem)

    def _body_follows(pos):
        """Return True if the first non-blank line after ``pos`` starts with a space."""
        nxt = pos + 1
        while nxt < len(lines) and not lines[nxt].strip():
            nxt += 1
        return nxt < len(lines) and lines[nxt].startswith(" ")

    start_idx = next(
        (i for i, line in enumerate(lines) if "One's-Self I Sing" in line),
        0,
    )

    current_title = ""
    current_body_lines = []

    for i in range(start_idx, len(lines)):
        line = lines[i]
        stripped = line.strip()

        # BOOK headers are neither titles nor poem text.
        if re.match(r"^BOOK\s+[IVXLC]+", stripped):
            continue

        title_shaped = (
            bool(stripped)
            and len(stripped) < 80
            and not line.startswith(" ")
            and not stripped.startswith("[")
            and not stripped.startswith("BOOK")
            and i > 0
            and not lines[i - 1].strip()
        )
        if title_shaped and _body_follows(i):
            _flush(current_title, current_body_lines)
            current_title = stripped
            current_body_lines = []
            continue

        if current_title:
            current_body_lines.append(line)

    _flush(current_title, current_body_lines)
    return poems
|
|
|
|
|
|
|
|
|
def extract_blake(text):
    """William Blake's Songs of Innocence and of Experience (Gutenberg 1934).

    ``[Picture: ...]`` tags are removed first. A heading is an upper-case
    line preceded by three newlines; each poem runs from the end of its
    heading to the start of the next heading (or to the end of the body).
    Headings listed in ``ignored`` are structural and produce no poem.
    Returns a list of poem dicts with title-cased titles.
    """
    body = re.sub(r"\[Picture:.*?\]", "", extract_body(text), flags=re.DOTALL)
    headings = list(re.finditer(r"\n\n\n([A-Z][A-Z ,'!?:;\-—]+)\n", body))

    ignored = {"SONGS OF INNOCENCE", "SONGS OF EXPERIENCE", "CONTENTS",
               "SONGS OF INNOCENCE AND OF EXPERIENCE"}

    # Stop position for heading k is the start of heading k+1; the last
    # heading stops at the end of the body.
    stops = [nxt.start() for nxt in headings[1:]] + [len(body)]

    poems = []
    for heading, stop in zip(headings, stops):
        name = heading.group(1).strip()
        if name in ignored:
            continue
        poem = make_poem(
            name.title(),
            body[heading.end():stop],
            "William Blake",
            "Songs of Innocence and of Experience",
            "1789–1794",
        )
        if poem:
            poems.append(poem)
    return poems
|
|
|
|
|
|
|
|
|
def extract_keats(text):
    """Keats: Poems Published in 1820 (Gutenberg 23684).

    The body is cut to begin at the ``ADVERTISEMENT`` line when one exists.
    Poems are then located with a hand-written table of
    ``(start marker, end marker)`` pairs: each poem runs from the first
    occurrence of its start marker to the next occurrence of its end marker
    (or to the end of the body). The marker line itself, trailing editorial
    line numbers and ``[n]`` footnote markers are removed.

    Display titles are the start marker in title case, with Roman numerals
    after "Part"/"Book" kept upper-case (``str.title()`` alone would turn
    "PART II" into "Part Ii"). Returns a list of poem dicts.
    """
    body = extract_body(text)
    poems = []

    advert_match = re.search(r"\nADVERTISEMENT\.?\n", body)
    if advert_match:
        body = body[advert_match.start():]

    poem_titles = [
        ("ADVERTISEMENT", "LAMIA"),
        ("LAMIA. PART I", "LAMIA. PART II"),
        ("LAMIA. PART II", "ISABELLA"),
        ("ISABELLA; OR, THE POT OF BASIL", "THE EVE OF ST. AGNES"),
        ("THE EVE OF ST. AGNES", "ODE TO A NIGHTINGALE"),
        ("ODE TO A NIGHTINGALE", "ODE ON A GRECIAN URN"),
        ("ODE ON A GRECIAN URN", "ODE TO PSYCHE"),
        ("ODE TO PSYCHE", "FANCY"),
        ("FANCY", "ODE"),
        ("ODE\n", "LINES ON THE MERMAID TAVERN"),
        ("LINES ON THE MERMAID TAVERN", "ROBIN HOOD"),
        ("ROBIN HOOD", "TO AUTUMN"),
        ("TO AUTUMN", "ODE ON MELANCHOLY"),
        ("ODE ON MELANCHOLY", "HYPERION"),
        ("HYPERION. BOOK I", "HYPERION. BOOK II"),
        ("HYPERION. BOOK II", "HYPERION. BOOK III"),
        ("HYPERION. BOOK III", "NOTE ON ADVERTISEMENT"),
    ]

    for title_start, title_end in poem_titles:
        # The advertisement is prose; its table entry never yields a poem.
        if title_start == "ADVERTISEMENT":
            continue

        start = body.find(title_start)
        if start == -1:
            continue

        end = body.find(title_end, start + len(title_start)) if title_end else -1
        if end == -1:
            end = len(body)

        section = body[start:end]
        # Drop the title line itself; keep everything from its newline on.
        first_newline = section.find("\n")
        poem_body = section[first_newline:] if first_newline != -1 else section

        if title_start == "ODE\n":
            display_title = "Ode (Bards of Passion and of Mirth)"
        else:
            display_title = title_start.replace(". PART ", ", Part ").title()
            # str.title() lower-cases all but the first letter of a word,
            # so restore the Roman numeral after "Part"/"Book".
            display_title = re.sub(
                r"\b(Part|Book) ([IVXLCivxlc]+)$",
                lambda m: f"{m.group(1)} {m.group(2).upper()}",
                display_title,
            )

        # Remove editorial line numbers at the end of lines.
        poem_body = re.sub(r"\s+\d+$", "", poem_body, flags=re.MULTILINE)
        # Remove footnote markers such as "[12]".
        poem_body = re.sub(r"\[\d+\]", "", poem_body)

        poem = make_poem(
            display_title,
            poem_body,
            "John Keats",
            "Poems Published in 1820",
            "1820",
        )
        if poem:
            poems.append(poem)

    return poems
|
|
|
|
|
|
|
|
|
def extract_poe(text):
    """Complete Poetical Works of Edgar Allan Poe (Gutenberg 10031).

    The body is walked line by line. Upper-case heading lines (see
    ``_is_heading``) are classified as:

    * a poem-section header, which switches collection on,
    * a non-poem-section header, which switches collection off,
    * inside a poem section, either a sub-heading to skip or the title of a
      new poem.

    Headings are compared both after trimming trailing punctuation and after
    removing punctuation entirely. A collected poem is kept only when its
    body has at least 50 characters and is not made up solely of whitespace
    and asterisks. Returns a list of poem dicts.
    """
    lines_list = extract_body(text).split("\n")
    poems = []

    # Top-level sections containing actual poems
    poem_sections = {"POEMS OF LATER LIFE", "POEMS OF MANHOOD",
                     "POEMS OF YOUTH", "DOUBTFUL POEMS"}
    # Top-level sections that are NOT poems
    non_poem_sections = {"CONTENTS", "TABLE OF CONTENTS", "NOTES", "NOTE",
                         "MEMOIR", "MEMOIR OF EDGAR ALLAN POE",
                         "PROSE POEMS", "ESSAYS", "NOTE ON POLITIAN",
                         'SCENES FROM "POLITIAN"'}
    # Sub-headings within poem sections to skip
    skip_titles = {"PREFACE", "INTRODUCTION TO POEMS1831",
                   "LETTER TO MR B", "JOHN H INGRAM",
                   "THE NOBLEST OF HER SEX",
                   "MISS ELIZABETH BARRETT BARRETT",
                   "OF ENGLAND", "I DEDICATE THIS VOLUME",
                   "WITH THE MOST ENTHUSIASTIC ADMIRATION AND",
                   "WEST POINT 1831", "DEAR B"}

    def _norm(s):
        """Remove punctuation characters and surrounding whitespace."""
        return re.sub(r"[.,:;\"\'-]", "", s).strip()

    # Normalised forms are constant, so build them once.
    normed_poem_sections = {_norm(s) for s in poem_sections}
    normed_non_poem_sections = {_norm(s) for s in non_poem_sections}
    normed_skip_titles = {_norm(s) for s in skip_titles}

    def _is_heading(stripped, idx):
        """Return True for a short upper-case line that follows a blank line."""
        if not stripped or len(stripped) >= 60:
            return False
        if stripped != stripped.upper():
            return False
        if not re.search(r"[A-Z]{3,}", stripped):
            return False
        if stripped.startswith(("[", "BY ", "***")):
            return False
        if re.match(r"^\d+$", stripped) or re.match(r"^\d+[.\s]", stripped):
            return False
        if re.match(r"^[IVXLC]+\.$", stripped):
            return False
        return idx > 0 and not lines_list[idx - 1].strip()

    def _save(title, body_lines):
        """Store the collected poem if it passes the content checks."""
        if not (title and body_lines):
            return
        body_text = "\n".join(body_lines)
        cleaned = body_text.strip()
        if re.match(r"^[\s*]+$", cleaned) or len(cleaned) < 50:
            return
        poem = make_poem(title, body_text,
                         "Edgar Allan Poe",
                         "Complete Poetical Works of Edgar Allan Poe",
                         "1827\u20131849")
        if poem:
            poems.append(poem)

    in_poem_section = False
    current_title = ""
    current_body_lines = []

    for idx, line in enumerate(lines_list):
        stripped = line.strip()

        if _is_heading(stripped, idx):
            clean = stripped.rstrip(".,:;").rstrip()
            normed = _norm(stripped)

            opens_poems = (clean in poem_sections
                           or normed in normed_poem_sections)
            closes_poems = (clean in non_poem_sections
                            or normed in normed_non_poem_sections)
            if opens_poems or closes_poems:
                _save(current_title, current_body_lines)
                # A poem-section match takes priority over a non-poem match.
                in_poem_section = opens_poems
                current_title = ""
                current_body_lines = []
                continue

            if in_poem_section:
                _save(current_title, current_body_lines)
                if normed in normed_skip_titles:
                    # Sub-heading that does not start a poem.
                    current_title = ""
                else:
                    title = stripped.title()
                    # Give "Part I" / "Part Ii" proper names
                    if re.match(r"Part [Ii]+\.", title):
                        title = "Al Aaraaf — " + title
                    current_title = title
                current_body_lines = []
                continue

        if in_poem_section and current_title:
            current_body_lines.append(line)

    _save(current_title, current_body_lines)
    return poems
|
|
|
|
|
|
|
|
|
def extract_browning_sonnets(text):
    """Sonnets from the Portuguese by Elizabeth Barrett Browning (Gutenberg 2002).

    The body is split on Roman numerals that are preceded by at least three
    newlines and followed by exactly three; each numeral becomes the title
    ``"Sonnet <numeral>"`` of the text after it. Returns a list of poem dicts.
    """
    # Splitting on a pattern with one capture group gives
    # [front matter, numeral, sonnet, numeral, sonnet, ...].
    pieces = re.split(r"\n\n\n+([IVXLC]+)\n\n\n", extract_body(text))
    candidates = (
        make_poem(
            f"Sonnet {numeral.strip()}",
            sonnet,
            "Elizabeth Barrett Browning",
            "Sonnets from the Portuguese",
            "1850",
        )
        for numeral, sonnet in zip(pieces[1::2], pieces[2::2])
    )
    return [poem for poem in candidates if poem]
|
|
|
|
|
|
|
|
|
def extract_eliot_wasteland(text):
    """The Waste Land by T.S. Eliot (Gutenberg 1321).

    Each of the five parts is located by a heading line of the form
    ``<numeral>. <TITLE>`` preceded by whitespace. When a heading occurs
    more than once the second occurrence is used (the first is taken to be
    the table of contents). A part ends where the next located part begins;
    the last part ends at the second "NOTES ON" occurrence (or the first, or
    the end of the body). Trailing line numbers are removed.
    Returns a list of poem dicts.
    """
    body = extract_body(text)

    section_defs = [
        ("I", "THE BURIAL OF THE DEAD"),
        ("II", "A GAME OF CHESS"),
        ("III", "THE FIRE SERMON"),
        ("IV", "DEATH BY WATER"),
        ("V", "WHAT THE THUNDER SAID"),
    ]

    # (display title, heading start, heading end) for every part found.
    spans = []
    for num, heading in section_defs:
        heading_re = re.compile(
            r"^\s+" + re.escape(num) + r"\.\s+" + re.escape(heading) + r"\s*$",
            re.MULTILINE,
        )
        hits = list(heading_re.finditer(body))
        if not hits:
            continue
        chosen = hits[1] if len(hits) >= 2 else hits[0]
        spans.append(
            (f"The Waste Land: {heading.title()}", chosen.start(), chosen.end())
        )

    note_starts = [m.start() for m in re.finditer(r"NOTES ON", body)]
    if len(note_starts) >= 2:
        notes_pos = note_starts[1]
    elif note_starts:
        notes_pos = note_starts[0]
    else:
        notes_pos = len(body)

    stops = [span[1] for span in spans[1:]] + [notes_pos]

    poems = []
    for (title, _heading_start, text_start), stop in zip(spans, stops):
        # Remove line numbers
        section = re.sub(r"\s+\d+$", "", body[text_start:stop], flags=re.MULTILINE)
        poem = make_poem(
            title,
            section,
            "T.S. Eliot",
            "The Waste Land",
            "1922",
        )
        if poem:
            poems.append(poem)
    return poems
|
|
|
|
|
|
|
|
|
def extract_frost_mountain(text):
    """Robert Frost's Mountain Interval (Gutenberg 29345).

    Scanning starts at the first line containing "THE ROAD NOT TAKEN" (or at
    the top when it is missing). A line is a poem title when it is either

    * an upper-case phrase wrapped in underscores (``_TITLE_``), or
    * a short (< 60 chars) upper-case line that follows a blank line and
      does not start with "[".

    Titles are converted with ``str.title()``; everything up to the next
    title is the poem body. Returns a list of poem dicts.
    """
    lines = extract_body(text).split("\n")
    poems = []

    def _flush(title, body_lines):
        """Store the collected poem if it has both a title and body lines."""
        if title and body_lines:
            poem = make_poem(
                title,
                "\n".join(body_lines),
                "Robert Frost",
                "Mountain Interval",
                "1916",
            )
            if poem:
                poems.append(poem)

    # "_THE ROAD NOT TAKEN_" contains "THE ROAD NOT TAKEN", so one
    # substring test covers both spellings.
    start_idx = next(
        (i for i, line in enumerate(lines) if "THE ROAD NOT TAKEN" in line),
        0,
    )

    current_title = ""
    current_body_lines = []

    for i in range(start_idx, len(lines)):
        line = lines[i]
        stripped = line.strip()

        title_text = None
        italic = re.match(r"^_([A-Z][A-Z ,'!?:\-.—\"]+)_$", stripped)
        if italic:
            title_text = italic.group(1).title()
        elif (stripped and len(stripped) < 60 and
                stripped == stripped.upper() and
                re.search(r"[A-Z]{3,}", stripped) and
                not stripped.startswith("[") and
                i > 0 and not lines[i - 1].strip()):
            # Non-italic title
            title_text = stripped.title()

        if title_text is not None:
            _flush(current_title, current_body_lines)
            current_title = title_text
            current_body_lines = []
            continue

        if current_title:
            current_body_lines.append(line)

    _flush(current_title, current_body_lines)
    return poems
|
|
|
|
|
|
|
|
|
def extract_frost_selected(text):
    """Robert Frost's Selected Poems (Gutenberg 59824).

    Scanning starts at the first line past index 50 that contains
    "THE PASTURE" (or at the top when there is none). A bare Roman numeral
    after a blank line is a section divider and is dropped. A short
    (< 70 chars) upper-case line after a blank line that does not start with
    "[" is a poem title; it is converted with ``str.title()``.
    Returns a list of poem dicts.
    """
    lines = extract_body(text).split("\n")
    poems = []

    def _flush(title, body_lines):
        """Store the collected poem if it has both a title and body lines."""
        if title and body_lines:
            poem = make_poem(
                title,
                "\n".join(body_lines),
                "Robert Frost",
                "Selected Poems",
                "1913–1916",
            )
            if poem:
                poems.append(poem)

    start_idx = next(
        (i for i, line in enumerate(lines) if "THE PASTURE" in line and i > 50),
        0,
    )

    current_title = ""
    current_body_lines = []

    for i in range(start_idx, len(lines)):
        line = lines[i]
        stripped = line.strip()
        after_blank = i > 0 and not lines[i - 1].strip()
        bare_numeral = re.match(r"^[IVX]+$", stripped) is not None

        # Section dividers like "I", "II", etc. (Roman numerals alone)
        if bare_numeral and after_blank:
            continue

        is_title = (
            after_blank
            and not bare_numeral
            and bool(stripped)
            and len(stripped) < 70
            and stripped == stripped.upper()
            and re.search(r"[A-Z]{3,}", stripped) is not None
            and not stripped.startswith("[")
        )
        if is_title:
            _flush(current_title, current_body_lines)
            current_title = stripped.title()
            current_body_lines = []
            continue

        if current_title:
            current_body_lines.append(line)

    _flush(current_title, current_body_lines)
    return poems
|
|
|
|
|
|
|
|
|
def extract_yeats(text):
    """W.B. Yeats' The Wind Among the Reeds (Gutenberg 32233).

    The body is cut to start at the second occurrence of the line
    "THE HOSTING OF THE SIDHE" (the first is taken to be the contents
    list), or at the first occurrence when there is only one. If the line
    is not found at all, an empty list is returned. From there, an
    upper-case line shorter than 80 characters that follows a blank line (or
    is the very first line) and does not start with "[" or "NOTE" opens a
    new poem; its title is converted with ``str.title()``.
    Returns a list of poem dicts.
    """
    body = extract_body(text)
    poems = []

    marker = "THE HOSTING OF THE SIDHE\n"
    first_hit = body.find(marker)
    if first_hit == -1:
        return poems
    second_hit = body.find(marker, first_hit + 10)
    body = body[second_hit:] if second_hit != -1 else body[first_hit:]

    def _flush(title, body_lines):
        """Store the collected poem if it has both a title and body lines."""
        if title and body_lines:
            poem = make_poem(
                title,
                "\n".join(body_lines),
                "W.B. Yeats",
                "The Wind Among the Reeds",
                "1899",
            )
            if poem:
                poems.append(poem)

    lines = body.split("\n")
    current_title = ""
    current_body_lines = []

    for i, line in enumerate(lines):
        stripped = line.strip()
        is_title = (
            bool(stripped)
            and len(stripped) < 80
            and stripped == stripped.upper()
            and re.search(r"[A-Z]{3,}", stripped) is not None
            and not stripped.startswith("[")
            and not stripped.startswith("NOTE")
            and (i == 0 or not lines[i - 1].strip())
        )
        if is_title:
            _flush(current_title, current_body_lines)
            current_title = stripped.title()
            current_body_lines = []
        elif current_title:
            current_body_lines.append(line)

    _flush(current_title, current_body_lines)
    return poems
|
|
|
|
|
|
|
|
|
def extract_khayyam(text):
    """The Rubaiyat of Omar Khayyam (Gutenberg 246).

    Quatrains are extracted for both the "First Edition" and the
    "Fifth Edition". For each, the first occurrence of the edition name
    that has a standalone "I." within the next 200 characters marks the
    start. The edition ends at the next "First/Fifth Edition" heading
    preceded by five newlines, else at a "Notes" heading preceded by five
    newlines, else at the end of the body. Each Roman-numeral heading
    becomes a poem titled ``"Quatrain <numeral> (<edition>)"``.
    Returns a list of poem dicts.
    """
    body = extract_body(text)
    poems = []

    for label in ("First Edition", "Fifth Edition"):
        # Pick the occurrence that is actually followed by quatrain "I.".
        ed_start = next(
            (
                hit.start()
                for hit in re.finditer(re.escape(label), body)
                if re.search(r"\n\n+I\.\n\n", body[hit.start():hit.start() + 200])
            ),
            None,
        )
        if ed_start is None:
            continue

        ed_body = body[ed_start:]
        # Skip the first 200 characters so the edition's own heading is not
        # mistaken for the boundary; add the offset back when slicing.
        tail = ed_body[200:]
        boundary = (
            re.search(r"\n\n\n\n\n(First|Fifth) Edition", tail)
            or re.search(r"\n\n\n\n\nNotes", tail)
        )
        if boundary:
            ed_body = ed_body[:boundary.start() + 200]

        # [preamble, numeral, quatrain, numeral, quatrain, ...]
        pieces = re.split(r"\n\n+([IVXLC]+)\.\n\n", ed_body)
        for numeral, quatrain in zip(pieces[1::2], pieces[2::2]):
            poem = make_poem(
                f"Quatrain {numeral.strip()} ({label})",
                quatrain,
                "Omar Khayyam (trans. Edward FitzGerald)",
                "The Rubaiyat of Omar Khayyam",
                "11th–12th century",
            )
            if poem:
                poems.append(poem)

    return poems
|
|
|
|
|
|
|
|
|
def extract_burns(text):
    """Poems and Songs of Robert Burns (Gutenberg 1279).

    The body is walked line by line:

    * A year heading ("1780" or "1771 - 1779") ends the current poem.
    * A title candidate is a non-blank line shorter than 80 characters that
      does not start with a space, "[" or "Footnote", follows a blank line
      and contains a capital letter followed by a lower-case letter.
      A candidate is accepted as a title when it starts with a genre word
      ("Song", "Epistle", ...), starts with a common opening ("To ", "On ",
      "The ", ...), contains an em dash or ends with ":", or when the next
      non-blank line starts with a space or contains lower-case text and
      the candidate does not start with one of ``skip_patterns``.
    * Directly following short lines starting with "On ", "In ", "To ",
      etc. are appended to the title.

    Returns a list of poem dicts.
    """
    lines = extract_body(text).split("\n")
    poems = []

    skip_patterns = {"INTRODUCTORY NOTE", "GLOSSARY", "INDEX", "NOTES",
                     "APPENDIX", "CONTENTS", "PREFACE"}

    year_heading = re.compile(r"^\d{4}(\s*[-–]\s*\d{4})?\s*$")
    genre_prefix = re.compile(
        r"^(Song|Ballad|Epistle|Elegy|Epitaph|Ode|Address|Epigram|Extempore|Fragment|Prologue|Lament|Lines|Stanzas|Verses|Inscription)[\s—\-:]"
    )
    common_opening = re.compile(
        r"^(To |On |The |A |My |Tam |Holy |Poor |Bonnie |Highland )"
    )
    title_continuation = re.compile(r"^(On |In |To |By |At |For |Or |And )")

    def _flush(title, body_lines):
        """Store the collected poem if it has both a title and body lines."""
        if title and body_lines:
            poem = make_poem(
                title,
                "\n".join(body_lines),
                "Robert Burns",
                "Poems and Songs of Robert Burns",
                "1771–1796",
            )
            if poem:
                poems.append(poem)

    current_title = ""
    current_body_lines = []

    i = 0
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()

        # Year headings like "1771 - 1779" or "1780"
        if year_heading.match(stripped):
            _flush(current_title, current_body_lines)
            current_title = ""
            current_body_lines = []
            i += 1
            continue

        candidate = (
            bool(stripped)
            and len(stripped) < 80
            and not line.startswith(" ")  # Not indented poem body
            and i > 0
            and not lines[i - 1].strip()
            and re.search(r"[A-Z][a-z]", stripped) is not None
            and not stripped.startswith("[")
            and not stripped.startswith("Footnote")
        )

        is_title = False
        if candidate:
            # NOTE: no ALL-CAPS test here. A candidate always contains a
            # lower-case letter (see the [A-Z][a-z] guard above), so
            # ``stripped == stripped.upper()`` could never be true.
            if genre_prefix.match(stripped) or common_opening.match(stripped):
                is_title = True
            elif "—" in stripped or stripped.endswith(":"):
                is_title = True
            elif i + 1 < len(lines):
                # Look at the next non-blank line for signs of a poem body.
                j = i + 1
                while j < len(lines) and not lines[j].strip():
                    j += 1
                if j < len(lines) and (
                    lines[j].startswith(" ")
                    or lines[j].strip() != lines[j].strip().upper()
                ):
                    if not any(stripped.upper().startswith(s) for s in skip_patterns):
                        is_title = True

        if is_title:
            # Absorb continuation lines of a multi-line title.
            full_title = stripped
            j = i + 1
            while (j < len(lines) and lines[j].strip() and
                   not lines[j].startswith(" ") and
                   len(lines[j].strip()) < 60):
                next_stripped = lines[j].strip()
                if not title_continuation.match(next_stripped):
                    break
                full_title += " " + next_stripped
                j += 1

            _flush(current_title, current_body_lines)
            current_title = full_title
            current_body_lines = []
            # Resume after the title, including any absorbed lines.
            i = j
            continue

        if current_title:
            current_body_lines.append(line)

        i += 1

    _flush(current_title, current_body_lines)
    return poems
|
|
|
|
|
|
|
|
|
def extract_wordsworth(text):
    """Lyrical Ballads by William Wordsworth (Gutenberg 9622).

    Scanning starts at the first line past index 50 that equals
    "THE RIME OF THE ANCYENT MARINERE" (or at the top when there is none).
    An upper-case line shorter than 80 characters that follows a blank line
    and does not start with "[", "NOTE", "***" or "PART " opens a new poem;
    its title is converted with ``str.title()``.
    Returns a list of poem dicts.
    """
    lines = extract_body(text).split("\n")
    poems = []

    def _flush(title, body_lines):
        """Store the collected poem if it has both a title and body lines."""
        if title and body_lines:
            poem = make_poem(
                title,
                "\n".join(body_lines),
                "William Wordsworth",
                "Lyrical Ballads",
                "1798",
            )
            if poem:
                poems.append(poem)

    start_idx = next(
        (
            i for i, line in enumerate(lines)
            if line.strip() == "THE RIME OF THE ANCYENT MARINERE" and i > 50
        ),
        0,
    )

    current_title = ""
    current_body_lines = []

    for i in range(start_idx, len(lines)):
        line = lines[i]
        stripped = line.strip()
        is_title = (
            bool(stripped)
            and len(stripped) < 80
            and stripped == stripped.upper()
            and re.search(r"[A-Z]{3,}", stripped) is not None
            and not stripped.startswith(("[", "NOTE", "***", "PART "))
            and (i == 0 or not lines[i - 1].strip())
        )
        if is_title:
            _flush(current_title, current_body_lines)
            current_title = stripped.title()
            current_body_lines = []
        elif current_title:
            current_body_lines.append(line)

    _flush(current_title, current_body_lines)
    return poems
|
|
|
|
|
|
|
|
|
def extract_shelley(text):
    """Extract poems from the Complete Poetical Works of Shelley (Gutenberg 4800).

    The Gutenberg body is scanned line by line.  A short ALL-CAPS line that
    follows a blank line is treated as a heading unless it looks like a
    structural marker (ACT/SCENE/PART, or a roman/arabic numeral followed by
    a period).  A heading closes the poem collected so far and opens a new
    one; the lines in between form the poem body.

    A collected poem is kept only when its stripped body is longer than 200
    characters and does not begin with PREFACE, DEDICATION, CANTO, PART,
    NOTE or an underscore followed by a digit.

    Args:
        text: Full text of the Gutenberg file.

    Returns:
        A list with the truthy results of make_poem(), in document order.
    """
    lines = extract_body(text).split("\n")
    poems = []

    # Headings that are dropped without closing the poem in progress.
    # NOTE(review): because the current poem stays open, the text following
    # such a heading is still appended to that poem -- confirm this is wanted.
    skip_titles = {"CONTENTS", "NOTE", "NOTES", "PREFACE", "INTRODUCTION",
                   "APPENDIX", "DEDICATION", "ADVERTISEMENTS", "MEMOIR",
                   "POSTSCRIPT", "DRAMATIS PERSONAE", "INDEX",
                   "BIBLIOGRAPHY", "TABLE OF CONTENTS"}
    # Editorial / prose headings: they close the current poem and everything
    # up to the next heading is discarded.
    # NOTE(review): "TO " discards every heading that starts with that word,
    # which would include poem titles of that form -- confirm this is wanted.
    skip_prefixes = ("NOTE BY", "TO ", "INCLUDING", "EDITED", "THOMAS",
                     "MARY W", "LONDON", "POSTSCRIPT")

    caps_run = re.compile(r"[A-Z]{3,}")
    # ACT/SCENE/PART dividers, roman-numeral lines ("III.") and numbered lines.
    structural = re.compile(r"(?:(?:ACT|SCENE|PART)\s|[IVX]+\.$|\d+\.)")
    non_verse_start = re.compile(r"^(PREFACE|DEDICATION|CANTO|PART|NOTE|_\d)")
    # A run of letters, optionally followed by an apostrophe suffix ("LOVE'S").
    heading_word = re.compile(r"[^\W\d_]+(?:['\u2019][^\W\d_]+)?")

    def to_title_case(heading):
        # str.title() restarts capitalization after an apostrophe and would
        # yield "Love'S Philosophy"; capitalize whole words instead.
        return heading_word.sub(lambda m: m.group(0).capitalize(), heading)

    def is_heading(idx, stripped):
        # Heading heuristic: 4-69 characters, all caps with at least three
        # consecutive capitals, not bracketed, not a "***" marker, not a
        # structural divider, and preceded by a blank line (or on line 0).
        return bool(
            3 < len(stripped) < 70
            and stripped == stripped.upper()
            and caps_run.search(stripped)
            and not stripped.startswith(("[", "***"))
            and not structural.match(stripped)
            and (idx == 0 or not lines[idx - 1].strip())
        )

    def flush(title, body_lines):
        # Keep only bodies with real verse content; short fragments, TOC
        # entries (mostly CANTO/CHAPTER lines) and notes are dropped.
        if not (title and body_lines):
            return
        poem_text = "\n".join(body_lines)
        cleaned = poem_text.strip()
        if len(cleaned) <= 200 or non_verse_start.match(cleaned):
            return
        poem = make_poem(
            title,
            poem_text,
            "Percy Bysshe Shelley",
            "Complete Poetical Works of Shelley",
            "1810–1822",
        )
        if poem:
            poems.append(poem)

    current_title = ""
    current_body_lines = []
    for idx, line in enumerate(lines):
        stripped = line.strip()

        if not is_heading(idx, stripped):
            # Lines seen while no poem is open are ignored.
            if current_title:
                current_body_lines.append(line)
            continue

        # Section headings and notes entries: drop the heading line only.
        if stripped.split(".")[0].strip() in skip_titles:
            continue
        if stripped.startswith("NOTES"):
            continue

        flush(current_title, current_body_lines)
        current_body_lines = []
        # An empty title makes the loop ignore lines until the next heading.
        if stripped.startswith(skip_prefixes):
            current_title = ""
        else:
            current_title = to_title_case(stripped)

    flush(current_title, current_body_lines)
    return poems
|
|
|
|
|
|
|
|
|
# ─── Sources ─────────────────────────────────────────────────────
|
|
|
|
|
|
# Each row is (Gutenberg id, output filename, collection title, author,
# extractor function); rows are expanded into dicts keyed by
# "id", "filename", "title", "author" and "extractor".
SOURCES = [
    {
        "id": gutenberg_id,
        "filename": filename,
        "title": title,
        "author": author,
        "extractor": extractor,
    }
    for gutenberg_id, filename, title, author, extractor in (
        (1041, "shakespeare_sonnets.json", "Shakespeare's Sonnets",
         "William Shakespeare", extract_shakespeare_sonnets),
        (12242, "dickinson_poems.json", "Poems by Emily Dickinson",
         "Emily Dickinson", extract_dickinson),
        (1322, "whitman_leaves_of_grass.json", "Leaves of Grass",
         "Walt Whitman", extract_whitman),
        (1934, "blake_songs.json", "Songs of Innocence and of Experience",
         "William Blake", extract_blake),
        (23684, "keats_poems_1820.json", "Poems Published in 1820",
         "John Keats", extract_keats),
        (10031, "poe_poetical_works.json", "Complete Poetical Works",
         "Edgar Allan Poe", extract_poe),
        (2002, "browning_sonnets_portuguese.json", "Sonnets from the Portuguese",
         "Elizabeth Barrett Browning", extract_browning_sonnets),
        (1321, "eliot_waste_land.json", "The Waste Land",
         "T.S. Eliot", extract_eliot_wasteland),
        (29345, "frost_mountain_interval.json", "Mountain Interval",
         "Robert Frost", extract_frost_mountain),
        (59824, "frost_selected_poems.json", "Selected Poems",
         "Robert Frost", extract_frost_selected),
        (32233, "yeats_wind_reeds.json", "The Wind Among the Reeds",
         "W.B. Yeats", extract_yeats),
        (246, "khayyam_rubaiyat.json", "The Rubaiyat of Omar Khayyam",
         "Omar Khayyam", extract_khayyam),
        (1279, "burns_poems_songs.json", "Poems and Songs",
         "Robert Burns", extract_burns),
        (9622, "wordsworth_lyrical_ballads.json", "Lyrical Ballads",
         "William Wordsworth", extract_wordsworth),
        (4800, "shelley_poetical_works.json", "Complete Poetical Works",
         "Percy Bysshe Shelley", extract_shelley),
    )
]
|
|
|
|
|
|
|
|
|
def download_source(source):
    """Fetch one source from Project Gutenberg and run its extractor.

    Args:
        source: A SOURCES entry; its "id" is passed to fetch_text() and its
            "extractor" callable is applied to the downloaded text.

    Returns:
        Whatever the extractor returns (the extracted poems).
    """
    gutenberg_id = source["id"]
    text = fetch_text(gutenberg_id)
    extractor = source["extractor"]
    poems = extractor(text)
    print(f" Extracted {len(poems)} poems")
    return poems
|
|
|
|
|
|
|
|
|
def save_poems(poems, filename):
    """Write *poems* as UTF-8 JSON to *filename* inside POETRY_DIR.

    The file is indented by two spaces and non-ASCII characters are written
    as-is rather than escaped.  The destination path is printed afterwards.
    """
    filepath = os.path.join(POETRY_DIR, filename)
    json_options = {"indent": 2, "ensure_ascii": False}
    with open(filepath, "w", encoding="utf-8") as handle:
        json.dump(poems, handle, **json_options)
    print(f" Saved to {filepath}")
|
|
|
|
|
|
|
|
|
def main():
    """Command-line entry point: list the sources or download all of them.

    With ``--list`` anywhere in ``sys.argv`` the configured sources are
    printed and nothing is downloaded.  Otherwise POETRY_DIR is created if
    needed and every entry of SOURCES is downloaded, parsed and saved.  A
    failure in one source is reported and does not stop the remaining
    sources; the closing summary names every source that produced no file.
    """
    if "--list" in sys.argv:
        print("Available poetry sources:\n")
        for s in SOURCES:
            print(f" {s['author']:35s} {s['title']}")
            print(f" {'':35s} Gutenberg #{s['id']}")
            print()
        return

    os.makedirs(POETRY_DIR, exist_ok=True)

    rule = "=" * 60
    total = 0
    failed = []  # filenames of sources that raised or yielded no poems
    for source in SOURCES:
        print(f"\n{rule}")
        print(f" {source['author']} — {source['title']}")
        print(f" Gutenberg #{source['id']}")
        print(rule)
        try:
            poems = download_source(source)
            if poems:
                save_poems(poems, source["filename"])
        except Exception as e:
            # Deliberately broad: this is the top-level boundary of a batch
            # job, and one bad source must not abort the others.  The type
            # name is included because str(e) alone can be cryptic (a
            # KeyError prints only the missing key).
            print(f" ERROR: {type(e).__name__}: {e}")
            failed.append(source["filename"])
            continue

        if poems:
            total += len(poems)
        else:
            print(" WARNING: No poems extracted!")
            failed.append(source["filename"])

    print(f"\n{rule}")
    print(f" Total: {total} poems downloaded")
    if failed:
        # Surface problems in the summary so a partially failed run is not
        # mistaken for a complete one.
        print(f" Failed: {len(failed)} of {len(SOURCES)} sources ({', '.join(failed)})")
    print(rule)
|
|
|
|
|
|
|
|
|
# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|