Add poetry collection: 3,155 poems from 15 Gutenberg sources
New files:
- download_poetry.py: Download/parse script with 15 extractors
- poetry/*.json: Pre-parsed poetry from Project Gutenberg
Poets included:
Shakespeare (154), Dickinson (439), Whitman (383),
Blake (43), Keats (10), Poe (108), E.B. Browning (44),
T.S. Eliot (5), Frost (82), Yeats (48), Khayyam (176),
Burns (563), Wordsworth (51), Shelley (1049)
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 week ago
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
"""Download and parse poetry collections from Project Gutenberg.
|
|
|
|
|
|
|
|
|
|
|
|
This is a maintainer-only tool. End users should use the pre-downloaded
|
|
|
|
|
|
poetry files in the poetry/ directory.
|
|
|
|
|
|
|
|
|
|
|
|
Usage:
|
|
|
|
|
|
python download_poetry.py # Download all sources
|
|
|
|
|
|
python download_poetry.py --list # List available sources
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
import json
|
|
|
|
|
|
import os
|
|
|
|
|
|
import re
|
|
|
|
|
|
import sys
|
|
|
|
|
|
import urllib.request
|
|
|
|
|
|
|
|
|
|
|
|
POETRY_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "poetry")
|
|
|
|
|
|
GUTENBERG_URL = "https://www.gutenberg.org/cache/epub/{id}/pg{id}.txt"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def fetch_text(gutenberg_id):
    """Fetch a Project Gutenberg plain-text e-book and return it as a string.

    The URL is built from ``GUTENBERG_URL`` and ``gutenberg_id``. The
    response bytes are decoded as UTF-8 (a leading byte-order mark is
    dropped by the ``utf-8-sig`` codec) and every ``\\r\\n`` / ``\\r`` line
    ending is rewritten to ``\\n``. No other cleaning is performed.
    """
    source_url = GUTENBERG_URL.format(id=gutenberg_id)
    print(f" Downloading {source_url} ...")
    request = urllib.request.Request(
        source_url,
        headers={"User-Agent": "PoetryDownloader/1.0"},
    )
    with urllib.request.urlopen(request, timeout=30) as response:
        payload = response.read()
    decoded = payload.decode("utf-8-sig")
    # Collapse Windows and old-Mac line endings to plain newlines.
    return decoded.replace("\r\n", "\n").replace("\r", "\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_body(text):
    """Return the part of *text* between the Gutenberg START/END markers.

    The markers look like ``*** START OF ... ***`` and ``*** END OF ... ***``.

    * Both markers present: the text strictly between them is returned.
    * Only START present: everything after the START marker is returned.
    * Only END present: everything before the END marker is returned, so the
      trailing licence block is still dropped.
    * Neither present: *text* is returned unchanged.

    The END marker is searched for only *after* the START marker, so an
    "END OF" banner that happens to precede the real START cannot produce an
    empty or inverted slice.
    """
    start = re.search(r"\*\*\*\s*START OF.*?\*\*\*", text)
    body_from = start.end() if start else 0
    # Pattern.search(text, pos) begins scanning at body_from.
    end = re.compile(r"\*\*\*\s*END OF.*?\*\*\*").search(text, body_from)
    body_to = end.start() if end else len(text)
    return text[body_from:body_to]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def clean_poem(text):
    """Normalise the whitespace of a poem body.

    Trailing whitespace is stripped from every line, and blank lines at the
    very beginning and end are dropped. Leading indentation and blank lines
    *inside* the poem are preserved.

    Returns the cleaned text joined with ``\\n`` (an empty string when the
    input contains nothing but whitespace).
    """
    lines = [line.rstrip() for line in text.split("\n")]

    # After rstrip() a whitespace-only line is exactly "", so truthiness is
    # enough. Trim with indices instead of repeated pop(0), which is O(n)
    # per call.
    first = 0
    last = len(lines)
    while first < last and not lines[first]:
        first += 1
    while last > first and not lines[last - 1]:
        last -= 1

    return "\n".join(lines[first:last])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def make_poem(title, body, author, source, period):
    """Build the dict describing one poem, or return ``None`` if it is too short.

    *body* is passed through ``clean_poem`` first. If the cleaned body is
    empty or shorter than 20 characters the poem is rejected and ``None``
    is returned; otherwise a dict with the keys ``title``, ``body``,
    ``author``, ``source`` and ``period`` is returned (the title is
    stripped of surrounding whitespace).
    """
    cleaned = clean_poem(body)
    has_enough_text = bool(cleaned) and len(cleaned.strip()) >= 20
    if not has_enough_text:
        return None
    return dict(
        title=title.strip(),
        body=cleaned,
        author=author,
        source=source,
        period=period,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ─── Extractors ─────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
def extract_shakespeare_sonnets(text):
    """Shakespeare's Sonnets (Gutenberg 1041).

    The book body is split on Roman numerals that stand alone on a line with
    a blank line on each side. Because the numeral is a capture group,
    ``re.split`` yields ``[intro, numeral, sonnet, numeral, sonnet, ...]``;
    each (numeral, sonnet) pair becomes one poem titled ``Sonnet <numeral>``.
    Pairs rejected by ``make_poem`` are skipped.
    """
    chunks = re.split(r"\n\n([IVXLC]+)\n\n", extract_body(text))
    # chunks[0] is the front matter; numerals sit at odd indices and the
    # matching sonnet text at the following even index.
    candidates = (
        make_poem(
            f"Sonnet {numeral.strip()}",
            sonnet_text,
            "William Shakespeare",
            "Shakespeare's Sonnets",
            "1609",
        )
        for numeral, sonnet_text in zip(chunks[1::2], chunks[2::2])
    )
    return [poem for poem in candidates if poem]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_dickinson(text):
    """Emily Dickinson's Poems, Three Series (Gutenberg 12242).

    Scanning starts at the first line equal to ``I. LIFE.`` (or at the top
    of the body if that line is never found). From there each line is
    classified as one of:

    * a section header such as ``I. LIFE.`` / ``II. LOVE.``,
    * a series divider (a line containing ``SERIES`` together with
      ``SECOND`` or ``THIRD``),
    * a poem title: a non-empty ALL-CAPS line under 60 characters with at
      least two consecutive capitals, not starting with ``[``, and preceded
      by a blank line,
    * otherwise a body line of the poem currently being collected.

    Headers, dividers and titles all close the poem in progress. Titles are
    stored title-cased via ``str.title()``.

    Returns a list of poem dicts as produced by ``make_poem``.
    """
    lines = extract_body(text).split("\n")
    poems = []
    section_header = re.compile(r"^[IVX]+\.\s+[A-Z]+\.\s*$")

    def _flush(title, body_lines):
        """Store the poem collected so far if it has a title and body lines."""
        if title and body_lines:
            poem = make_poem(
                title,
                "\n".join(body_lines),
                "Emily Dickinson",
                "Poems by Emily Dickinson, Three Series",
                "1890–1896",
            )
            if poem:
                poems.append(poem)

    # Skip the front matter: start at the first topic heading.
    start_idx = 0
    for idx, line in enumerate(lines):
        if line.strip() == "I. LIFE.":
            start_idx = idx
            break

    current_title = ""
    current_body_lines = []

    for idx in range(start_idx, len(lines)):
        line = lines[idx]
        stripped = line.strip()

        # Section headers like "I. LIFE.", "II. LOVE." end the current poem.
        if section_header.match(stripped):
            _flush(current_title, current_body_lines)
            current_title = ""
            current_body_lines = []
            continue

        # Series dividers. The parentheses matter: without them `and` binds
        # tighter than `or`, and any line containing "THIRD" would be
        # treated as a divider.
        if "SERIES" in stripped and ("SECOND" in stripped or "THIRD" in stripped):
            _flush(current_title, current_body_lines)
            current_title = ""
            current_body_lines = []
            continue

        # Poem title: short ALL-CAPS line directly after a blank line.
        # Section headers were already consumed above.
        looks_like_title = (
            stripped
            and len(stripped) < 60
            and not stripped.startswith("[")
            and stripped == stripped.upper()
            and re.search(r"[A-Z]{2,}", stripped)
            and idx > 0
            and not lines[idx - 1].strip()
        )
        if looks_like_title:
            _flush(current_title, current_body_lines)
            current_title = stripped.title()
            current_body_lines = []
            continue

        # Lines seen while no title is active are never saved, so only
        # collect while a poem is open.
        if current_title:
            current_body_lines.append(line)

    # Don't forget the last poem.
    _flush(current_title, current_body_lines)
    return poems
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_whitman(text):
    """Walt Whitman's Leaves of Grass (Gutenberg 1322).

    Scanning begins at the first line containing ``One's-Self I Sing`` (or
    at the top of the body if it is absent). ``BOOK <roman>`` header lines
    are dropped. A line is taken as a poem title when it is non-empty,
    shorter than 80 characters, does not begin with a space, ``[`` or
    ``BOOK``, follows a blank line, and the next non-blank line begins with
    a space (i.e. an indented poem body). Every other line is appended to
    the poem currently being collected.

    Returns a list of poem dicts as produced by ``make_poem``.
    """
    lines = extract_body(text).split("\n")
    total = len(lines)
    poems = []

    def _store(title, body_lines):
        """Store the collected poem if it has both a title and body lines."""
        if not (title and body_lines):
            return
        poem = make_poem(
            title,
            "\n".join(body_lines),
            "Walt Whitman",
            "Leaves of Grass",
            "1891–1892",
        )
        if poem:
            poems.append(poem)

    # Skip everything before the opening inscription poem.
    start_idx = next(
        (n for n, candidate in enumerate(lines) if "One's-Self I Sing" in candidate),
        0,
    )

    current_title = ""
    current_body_lines = []

    for pos in range(start_idx, total):
        line = lines[pos]
        stripped = line.strip()

        # "BOOK I", "BOOK XX." etc. are structural headers, not poem text.
        if re.match(r"^BOOK\s+[IVXLC]+", stripped):
            continue

        title_candidate = (
            stripped
            and len(stripped) < 80
            and not line.startswith(" ")
            and not stripped.startswith("[")
            and not stripped.startswith("BOOK")
            and pos > 0
            and not lines[pos - 1].strip()
        )
        if title_candidate:
            # Confirm by looking ahead: the next non-blank line must be
            # indented for this to be a real title.
            ahead = pos + 1
            while ahead < total and not lines[ahead].strip():
                ahead += 1
            if ahead < total and lines[ahead].startswith(" "):
                _store(current_title, current_body_lines)
                current_title = stripped
                current_body_lines = []
                continue

        if current_title:
            current_body_lines.append(line)

    _store(current_title, current_body_lines)
    return poems
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_blake(text):
    """William Blake's Songs of Innocence and of Experience (Gutenberg 1934).

    ``[Picture: ...]`` tags are removed first. A title is an ALL-CAPS line
    (letters, spaces and a few punctuation marks) preceded by three
    newlines. Each poem body runs from the end of its title match to the
    start of the next title match, or to the end of the text for the last
    one. Book/section headings listed in ``non_poem_headings`` are skipped.
    Titles are stored title-cased via ``str.title()``.
    """
    body = re.sub(r"\[Picture:.*?\]", "", extract_body(text), flags=re.DOTALL)

    headings = list(re.finditer(r"\n\n\n([A-Z][A-Z ,'!?:;\-—]+)\n", body))
    non_poem_headings = {
        "SONGS OF INNOCENCE",
        "SONGS OF EXPERIENCE",
        "CONTENTS",
        "SONGS OF INNOCENCE AND OF EXPERIENCE",
    }

    # Where each heading's text stops: at the next heading, or at the end
    # of the body for the final heading.
    stops = [following.start() for following in headings[1:]] + [len(body)]

    poems = []
    for heading, stop in zip(headings, stops):
        name = heading.group(1).strip()
        if name in non_poem_headings:
            continue
        poem = make_poem(
            name.title(),
            body[heading.end():stop],
            "William Blake",
            "Songs of Innocence and of Experience",
            "1789–1794",
        )
        if poem:
            poems.append(poem)
    return poems
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_keats(text):
    """Keats: Poems Published in 1820 (Gutenberg 23684).

    Everything before the ``ADVERTISEMENT`` heading is discarded when that
    heading is found. Poems are then cut out using a hard-coded table of
    ``(start marker, end marker)`` pairs: each poem runs from the first
    occurrence of its start marker to the next occurrence of its end marker
    (or to the end of the text if the end marker is missing). The title line
    itself, trailing editorial line numbers and ``[n]`` footnote markers are
    removed from the body.

    Display titles are title-cased. Roman numerals after "Part"/"Book" are
    restored to upper case, because ``str.title()`` would otherwise produce
    "Part Ii" / "Book Iii".

    Returns a list of poem dicts as produced by ``make_poem``.
    """
    body = extract_body(text)
    poems = []

    # Drop the front matter that precedes the poems.
    advert_match = re.search(r"\nADVERTISEMENT\.?\n", body)
    if advert_match:
        body = body[advert_match.start():]

    # (marker that opens the poem, marker that opens whatever follows it)
    poem_titles = [
        ("ADVERTISEMENT", "LAMIA"),
        ("LAMIA. PART I", "LAMIA. PART II"),
        ("LAMIA. PART II", "ISABELLA"),
        ("ISABELLA; OR, THE POT OF BASIL", "THE EVE OF ST. AGNES"),
        ("THE EVE OF ST. AGNES", "ODE TO A NIGHTINGALE"),
        ("ODE TO A NIGHTINGALE", "ODE ON A GRECIAN URN"),
        ("ODE ON A GRECIAN URN", "ODE TO PSYCHE"),
        ("ODE TO PSYCHE", "FANCY"),
        ("FANCY", "ODE"),
        ("ODE\n", "LINES ON THE MERMAID TAVERN"),
        ("LINES ON THE MERMAID TAVERN", "ROBIN HOOD"),
        ("ROBIN HOOD", "TO AUTUMN"),
        ("TO AUTUMN", "ODE ON MELANCHOLY"),
        ("ODE ON MELANCHOLY", "HYPERION"),
        ("HYPERION. BOOK I", "HYPERION. BOOK II"),
        ("HYPERION. BOOK II", "HYPERION. BOOK III"),
        ("HYPERION. BOOK III", "NOTE ON ADVERTISEMENT"),
    ]

    for title_start, title_end in poem_titles:
        display_title = title_start.replace(". PART ", ", Part ").title()
        # The advertisement is prose, not a poem.
        if display_title == "Advertisement":
            continue
        if display_title.startswith("Ode\n"):
            display_title = "Ode (Bards of Passion and of Mirth)"
        # Undo str.title()'s lower-casing of Roman numerals ("Ii" -> "II").
        display_title = re.sub(
            r"\b(Part|Book) ([IVXivx]+)$",
            lambda m: f"{m.group(1)} {m.group(2).upper()}",
            display_title,
        )

        start = body.find(title_start)
        if start == -1:
            continue
        # Only look for the end marker once a valid start is known.
        end = body.find(title_end, start + len(title_start))
        if end == -1:
            end = len(body)

        section = body[start:end]
        # Remove the title line itself.
        first_newline = section.find("\n")
        poem_body = section[first_newline:] if first_newline != -1 else section

        # Remove editorial line numbers at line ends.
        poem_body = re.sub(r"\s+\d+$", "", poem_body, flags=re.MULTILINE)
        # Remove footnote markers such as [12].
        poem_body = re.sub(r"\[\d+\]", "", poem_body)

        p = make_poem(
            display_title,
            poem_body,
            "John Keats",
            "Poems Published in 1820",
            "1820",
        )
        if p:
            poems.append(p)

    return poems
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_poe(text):
    """Complete Poetical Works of Edgar Allan Poe (Gutenberg 10031).

    The body is scanned line by line. ALL-CAPS heading lines (see
    ``_is_title``) drive a small state machine:

    * a heading naming one of ``poem_sections`` switches collection on,
    * a heading naming one of ``non_poem_sections`` switches it off,
    * inside a poem section, a heading listed in ``skip_titles`` closes the
      current poem without opening a new one,
    * any other heading inside a poem section starts a new poem.

    Headings are compared both after stripping trailing punctuation and
    after ``_norm`` removes punctuation entirely. Non-heading lines are
    appended to the poem in progress. A poem is stored only if its stripped
    body is at least 50 characters long and is not made up solely of
    whitespace and asterisks.

    Returns a list of poem dicts as produced by ``make_poem``.
    """
    lines_list = extract_body(text).split("\n")
    poems = []

    # Top-level sections containing actual poems
    poem_sections = {"POEMS OF LATER LIFE", "POEMS OF MANHOOD",
                     "POEMS OF YOUTH", "DOUBTFUL POEMS"}
    # Top-level sections that are NOT poems
    non_poem_sections = {"CONTENTS", "TABLE OF CONTENTS", "NOTES", "NOTE",
                         "MEMOIR", "MEMOIR OF EDGAR ALLAN POE",
                         "PROSE POEMS", "ESSAYS", "NOTE ON POLITIAN",
                         'SCENES FROM "POLITIAN"'}
    # Sub-headings within poem sections to skip
    skip_titles = {"PREFACE", "INTRODUCTION TO POEMS1831",
                   "LETTER TO MR B", "JOHN H INGRAM",
                   "THE NOBLEST OF HER SEX",
                   "MISS ELIZABETH BARRETT BARRETT",
                   "OF ENGLAND", "I DEDICATE THIS VOLUME",
                   "WITH THE MOST ENTHUSIASTIC ADMIRATION AND",
                   "WEST POINT 1831", "DEAR B"}

    def _norm(s):
        """Drop punctuation and surrounding whitespace for loose comparison."""
        return re.sub(r"[.,:;\"\'-]", "", s).strip()

    # Normalised lookup sets are loop-invariant, so build them once.
    poem_section_keys = {_norm(s) for s in poem_sections}
    non_poem_section_keys = {_norm(s) for s in non_poem_sections}
    skip_title_keys = {_norm(s) for s in skip_titles}

    def _is_title(stripped, idx):
        """Return a truthy value if line *idx* looks like an ALL-CAPS heading."""
        return (stripped and
                len(stripped) < 60 and
                stripped == stripped.upper() and
                re.search(r"[A-Z]{3,}", stripped) and
                not stripped.startswith("[") and
                not stripped.startswith("BY ") and
                not re.match(r"^\d+$", stripped) and
                not re.match(r"^\d+[.\s]", stripped) and
                not re.match(r"^[IVXLC]+\.$", stripped) and
                not stripped.startswith("***") and
                idx > 0 and not lines_list[idx - 1].strip())

    def _save_current(title, body_lines):
        """Store the collected poem if it has a title and a substantial body."""
        if not (title and body_lines):
            return
        body_text = "\n".join(body_lines)
        cleaned = body_text.strip()
        # Reject separator-only bodies ("* * *") and very short fragments.
        if re.match(r"^[\s*]+$", cleaned) or len(cleaned) < 50:
            return
        p = make_poem(title, body_text,
                      "Edgar Allan Poe",
                      "Complete Poetical Works of Edgar Allan Poe",
                      "1827\u20131849")
        if p:
            poems.append(p)

    in_poem_section = False
    current_title = ""
    current_body_lines = []

    for i, line in enumerate(lines_list):
        stripped = line.strip()

        if _is_title(stripped, i):
            clean = stripped.rstrip(".,:;").rstrip()
            normed = _norm(stripped)

            is_poem_header = clean in poem_sections or normed in poem_section_keys
            is_non_poem_header = (clean in non_poem_sections
                                  or normed in non_poem_section_keys)

            # Section headers open or close collection; a poem-section
            # match takes precedence if both were ever to match.
            if is_poem_header or is_non_poem_header:
                _save_current(current_title, current_body_lines)
                in_poem_section = bool(is_poem_header)
                current_title = ""
                current_body_lines = []
                continue

            # Within a poem section
            if in_poem_section:
                _save_current(current_title, current_body_lines)
                if normed in skip_title_keys:
                    # Sub-heading to skip: close the poem, open nothing.
                    current_title = ""
                else:
                    # New poem title
                    title = stripped.title()
                    # Give "Part I" / "Part Ii" proper names
                    if re.match(r"Part [Ii]+\.", title):
                        title = "Al Aaraaf — " + title
                    current_title = title
                current_body_lines = []
                continue

        if in_poem_section and current_title:
            current_body_lines.append(line)

    _save_current(current_title, current_body_lines)
    return poems
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_browning_sonnets(text):
    """Sonnets from the Portuguese by Elizabeth Barrett Browning (Gutenberg 2002).

    The body is split on Roman-numeral header lines that have at least three
    newlines before them and exactly three after. Because the numeral is a
    capture group, the split yields ``[intro, numeral, sonnet, ...]``; each
    (numeral, sonnet) pair becomes a poem titled ``Sonnet <numeral>``.
    Pairs rejected by ``make_poem`` are skipped.
    """
    chunks = re.split(r"\n\n\n+([IVXLC]+)\n\n\n", extract_body(text))
    # chunks[0] is front matter; numerals are at odd indices, texts at even.
    candidates = (
        make_poem(
            f"Sonnet {numeral.strip()}",
            sonnet_text,
            "Elizabeth Barrett Browning",
            "Sonnets from the Portuguese",
            "1850",
        )
        for numeral, sonnet_text in zip(chunks[1::2], chunks[2::2])
    )
    return [poem for poem in candidates if poem]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_eliot_wasteland(text):
    """The Waste Land by T.S. Eliot (Gutenberg 1321).

    Each of the five numbered parts is located by a heading of the form
    ``<roman>. <TITLE>`` preceded by whitespace at the start of a line
    (which may be the newline of a blank line above it). When a heading
    occurs more than once, the second occurrence is used, since the first
    is taken to be the table of contents. A part runs from the end of its
    heading to the start of the next located heading; the last part ends at
    the second ``NOTES ON`` occurrence (or the first if there is only one,
    or the end of the text if there is none). Trailing line numbers are
    stripped from each part.

    Returns a list of poem dicts titled ``The Waste Land: <Part Title>``.
    """
    body = extract_body(text)

    section_defs = [
        ("I", "THE BURIAL OF THE DEAD"),
        ("II", "A GAME OF CHESS"),
        ("III", "THE FIRE SERMON"),
        ("IV", "DEATH BY WATER"),
        ("V", "WHAT THE THUNDER SAID"),
    ]

    # (display title, heading start offset, heading end offset)
    positions = []
    for num, heading in section_defs:
        heading_re = re.compile(
            r"^\s+" + re.escape(num) + r"\.\s+" + re.escape(heading) + r"\s*$",
            re.MULTILINE,
        )
        hits = list(heading_re.finditer(body))
        if not hits:
            continue
        # Prefer the second hit; fall back to the only one available.
        chosen = hits[1] if len(hits) >= 2 else hits[0]
        positions.append(
            (f"The Waste Land: {heading.title()}", chosen.start(), chosen.end())
        )

    # Where the poem proper stops and the notes begin.
    notes_hits = list(re.finditer(r"NOTES ON", body))
    if len(notes_hits) >= 2:
        notes_pos = notes_hits[1].start()
    elif notes_hits:
        notes_pos = notes_hits[0].start()
    else:
        notes_pos = len(body)

    # Each part ends where the next heading starts; the last ends at the notes.
    stops = [entry[1] for entry in positions[1:]] + [notes_pos]

    poems = []
    for (part_title, _heading_start, text_start), stop in zip(positions, stops):
        # Remove line numbers
        part_text = re.sub(r"\s+\d+$", "", body[text_start:stop], flags=re.MULTILINE)
        poem = make_poem(
            part_title,
            part_text,
            "T.S. Eliot",
            "The Waste Land",
            "1922",
        )
        if poem:
            poems.append(poem)
    return poems
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_frost_mountain(text):
    """Robert Frost's Mountain Interval (Gutenberg 29345).

    Scanning starts at the first line containing ``THE ROAD NOT TAKEN``
    (or at the top of the body if it is absent). A line is treated as a
    poem title when either

    * it is an ALL-CAPS phrase wrapped in underscores, e.g.
      ``_THE ROAD NOT TAKEN_`` (the underscores are dropped), or
    * it is a non-empty ALL-CAPS line under 60 characters with at least
      three consecutive capitals, not starting with ``[``, and preceded by
      a blank line.

    Titles are stored title-cased via ``str.title()``; every other line is
    appended to the poem currently being collected.

    Returns a list of poem dicts as produced by ``make_poem``.
    """
    lines = extract_body(text).split("\n")
    poems = []
    italic_title = re.compile(r"^_([A-Z][A-Z ,'!?:\-.—\"]+)_$")

    def _flush(title, body_lines):
        """Store the collected poem if it has both a title and body lines."""
        if title and body_lines:
            poem = make_poem(
                title,
                "\n".join(body_lines),
                "Robert Frost",
                "Mountain Interval",
                "1916",
            )
            if poem:
                poems.append(poem)

    # Find the first mention of the opening poem. The plain substring test
    # also covers the underscore-wrapped form.
    start_idx = 0
    for idx, line in enumerate(lines):
        if "THE ROAD NOT TAKEN" in line:
            start_idx = idx
            break

    current_title = ""
    current_body_lines = []

    for idx in range(start_idx, len(lines)):
        line = lines[idx]
        stripped = line.strip()

        title_text = None
        italic = italic_title.match(stripped)
        if italic:
            title_text = italic.group(1).title()
        elif (stripped and len(stripped) < 60 and
              stripped == stripped.upper() and
              re.search(r"[A-Z]{3,}", stripped) and
              not stripped.startswith("[") and
              idx > 0 and not lines[idx - 1].strip()):
            # Non-italic title: short ALL-CAPS line after a blank line.
            title_text = stripped.title()

        if title_text is not None:
            _flush(current_title, current_body_lines)
            current_title = title_text
            current_body_lines = []
            continue

        if current_title:
            current_body_lines.append(line)

    _flush(current_title, current_body_lines)
    return poems
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_frost_selected(text):
    """Robert Frost's Selected Poems (Gutenberg 59824).

    Scanning starts at the first line after index 50 that contains
    ``THE PASTURE`` (or at the top of the body if none is found). Lines
    consisting only of Roman numerals that follow a blank line are treated
    as section dividers and dropped. A poem title is a non-empty ALL-CAPS
    line under 70 characters with at least three consecutive capitals, not
    starting with ``[``, not a bare Roman numeral, and preceded by a blank
    line. Titles are stored title-cased via ``str.title()``; all other
    lines are appended to the poem currently being collected.

    Returns a list of poem dicts as produced by ``make_poem``.
    """
    lines = extract_body(text).split("\n")
    poems = []

    def _store(title, body_lines):
        """Store the collected poem if it has both a title and body lines."""
        if not (title and body_lines):
            return
        poem = make_poem(
            title,
            "\n".join(body_lines),
            "Robert Frost",
            "Selected Poems",
            "1913–1916",
        )
        if poem:
            poems.append(poem)

    # Find start of poems
    start_idx = next(
        (n for n, candidate in enumerate(lines)
         if "THE PASTURE" in candidate and n > 50),
        0,
    )

    current_title = ""
    current_body_lines = []

    for pos in range(start_idx, len(lines)):
        line = lines[pos]
        stripped = line.strip()
        after_blank = pos > 0 and not lines[pos - 1].strip()
        bare_numeral = re.match(r"^[IVX]+$", stripped)

        # Section dividers like "I", "II" standing alone after a blank line.
        if bare_numeral and after_blank:
            continue

        is_title = (
            stripped
            and len(stripped) < 70
            and stripped == stripped.upper()
            and re.search(r"[A-Z]{3,}", stripped)
            and not stripped.startswith("[")
            and not bare_numeral
            and after_blank
        )
        if is_title:
            _store(current_title, current_body_lines)
            current_title = stripped.title()
            current_body_lines = []
            continue

        if current_title:
            current_body_lines.append(line)

    _store(current_title, current_body_lines)
    return poems
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_yeats(text):
    """Extract poems from W.B. Yeats' The Wind Among the Reeds (Gutenberg 32233).

    The body is trimmed to start at the second occurrence of the opening
    poem's heading (the first occurrence is expected to be the CONTENTS
    entry); when the heading occurs only once, that occurrence is used.
    From there, every short ALL-CAPS line that follows a blank line starts a
    new poem, and the lines up to the next such heading form its text.

    Args:
        text: Raw Gutenberg text of the book.

    Returns:
        A list of the records returned by ``make_poem`` (falsy results are
        dropped). Empty when the opening heading cannot be found.
    """
    opening_heading = "THE HOSTING OF THE SIDHE\n"
    # A "word" is a run of letters, optionally followed by an apostrophe
    # (straight or typographic) and more letters, e.g. LOVER'S.
    word_re = re.compile(r"[^\W\d_]+(?:['\u2019][^\W\d_]+)?")

    def title_case(heading):
        # str.title() treats an apostrophe as a word boundary and would turn
        # "LOVER'S" into "Lover'S"; capitalising whole words avoids that.
        return word_re.sub(lambda m: m.group(0).capitalize(), heading)

    def is_heading(index, stripped):
        # Short ALL-CAPS line with at least three consecutive capitals, not a
        # bracketed remark or a NOTE line, and preceded by a blank line.
        return bool(
            stripped
            and len(stripped) < 80
            and stripped == stripped.upper()
            and re.search(r"[A-Z]{3,}", stripped)
            and not stripped.startswith(("[", "NOTE"))
            and (index == 0 or not lines[index - 1].strip())
        )

    def flush(title, body_lines):
        # Emit the poem collected so far, if there is one.
        if not (title and body_lines):
            return
        poem = make_poem(
            title,
            "\n".join(body_lines),
            "W.B. Yeats",
            "The Wind Among the Reeds",
            "1899",
        )
        if poem:
            poems.append(poem)

    poems = []
    body = extract_body(text)

    first = body.find(opening_heading)
    if first == -1:
        return poems
    # Prefer the occurrence after the table of contents when there is one.
    second = body.find(opening_heading, first + 10)
    lines = body[second if second != -1 else first:].split("\n")

    current_title = ""
    current_body_lines = []
    for index, line in enumerate(lines):
        stripped = line.strip()
        if is_heading(index, stripped):
            flush(current_title, current_body_lines)
            current_title = title_case(stripped)
            current_body_lines = []
        elif current_title:
            current_body_lines.append(line)

    flush(current_title, current_body_lines)
    return poems
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_khayyam(text):
    """Extract the quatrains of The Rubaiyat of Omar Khayyam (Gutenberg 246).

    Both the "First Edition" and the "Fifth Edition" sections are processed.
    A section starts at the first occurrence of its name that is followed,
    within 200 characters, by the quatrain marker "I."; it ends at the next
    edition heading or, failing that, at the "Notes" heading. Inside a
    section each Roman-numeral marker on its own paragraph introduces one
    quatrain.

    Args:
        text: Raw Gutenberg text of the book.

    Returns:
        A list of the records returned by ``make_poem`` (falsy results are
        dropped), titled "Quatrain <numeral> (<edition>)".
    """
    body = extract_body(text)
    poems = []

    for edition in ("First Edition", "Fifth Edition"):
        # The edition name also shows up in front matter; take the first hit
        # that is actually followed by quatrain "I.".
        section_start = next(
            (
                hit.start()
                for hit in re.finditer(re.escape(edition), body)
                if re.search(r"\n\n+I\.\n\n", body[hit.start():hit.start() + 200])
            ),
            None,
        )
        if section_start is None:
            continue

        section = body[section_start:]
        # Look for the closing boundary past the first 200 characters so the
        # section's own heading cannot match.
        tail = section[200:]
        next_edition = re.search(r"\n\n\n\n\n(First|Fifth) Edition", tail)
        notes_heading = re.search(r"\n\n\n\n\nNotes", tail)
        # An edition heading takes precedence over the Notes heading.
        boundary = next_edition or notes_heading
        if boundary:
            section = section[:boundary.start() + 200]

        # re.split with a capture group yields
        # [preamble, numeral, quatrain, numeral, quatrain, ...].
        pieces = re.split(r"\n\n+([IVXLC]+)\.\n\n", section)
        for raw_numeral, quatrain in zip(pieces[1::2], pieces[2::2]):
            numeral = raw_numeral.strip()
            poem = make_poem(
                f"Quatrain {numeral} ({edition})",
                quatrain,
                "Omar Khayyam (trans. Edward FitzGerald)",
                "The Rubaiyat of Omar Khayyam",
                "11th–12th century",
            )
            if poem:
                poems.append(poem)

    return poems
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_burns(text):
    """Extract poems from Poems and Songs of Robert Burns (Gutenberg 1279).

    The collection is organised by year. A line consisting of a year or a
    year range ends the poem in progress. A short, unindented, mixed-case
    line that follows a blank line is treated as a title when it

    * starts with a genre word ("Song", "Epistle", "Elegy", ...), or
    * starts with a common opening word ("To ", "On ", "The ", ...), or
    * contains an em dash or ends with a colon, or
    * is followed by a line that is indented or not all upper-case, and does
      not start with one of the editorial section names.

    Immediately following short unindented lines that begin with a
    preposition or conjunction ("On ", "In ", "And ", ...) are folded into
    the title.

    Args:
        text: Raw Gutenberg text of the book.

    Returns:
        A list of the records returned by ``make_poem`` (falsy results are
        dropped).
    """
    # Patterns are compiled once instead of on every line.
    year_heading = re.compile(r"^\d{4}(\s*[-–]\s*\d{4})?\s*$")
    mixed_case = re.compile(r"[A-Z][a-z]")
    genre_prefix = re.compile(
        r"^(Song|Ballad|Epistle|Elegy|Epitaph|Ode|Address|Epigram|Extempore|Fragment|Prologue|Lament|Lines|Stanzas|Verses|Inscription)[\s—\-:]"
    )
    opening_word = re.compile(
        r"^(To |On |The |A |My |Tam |Holy |Poor |Bonnie |Highland )"
    )
    title_continuation = re.compile(r"^(On |In |To |By |At |For |Or |And )")
    editorial_sections = (
        "INTRODUCTORY NOTE", "GLOSSARY", "INDEX", "NOTES",
        "APPENDIX", "CONTENTS", "PREFACE",
    )

    lines = extract_body(text).split("\n")
    poems = []

    def flush(title, body_lines):
        # Emit the poem collected so far, if there is one.
        if not (title and body_lines):
            return
        poem = make_poem(
            title,
            "\n".join(body_lines),
            "Robert Burns",
            "Poems and Songs of Robert Burns",
            "1771–1796",
        )
        if poem:
            poems.append(poem)

    def followed_by_verse(index):
        # True when the next non-blank line is indented or is not all caps.
        j = index + 1
        while j < len(lines) and not lines[j].strip():
            j += 1
        if j >= len(lines):
            return False
        following = lines[j]
        return (following.startswith(" ")
                or following.strip() != following.strip().upper())

    def is_title(index, line, stripped):
        # Basic shape: short, unindented, after a blank line, mixed case,
        # and not a bracketed remark or a footnote.
        if not stripped or len(stripped) >= 80 or line.startswith(" "):
            return False
        if index == 0 or lines[index - 1].strip():
            return False
        if not mixed_case.search(stripped):
            return False
        if stripped.startswith(("[", "Footnote")):
            return False
        # The original also had an "ALL CAPS title" test here. It could never
        # fire because the mixed-case requirement above guarantees a
        # lower-case letter, so it has been removed.
        if genre_prefix.match(stripped) or opening_word.match(stripped):
            return True
        if "—" in stripped or stripped.endswith(":"):
            return True
        return (followed_by_verse(index)
                and not stripped.upper().startswith(editorial_sections))

    current_title = ""
    current_body_lines = []
    i = 0
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()

        # Year headings like "1771 - 1779" or "1780" close the current poem.
        if year_heading.match(stripped):
            flush(current_title, current_body_lines)
            current_title = ""
            current_body_lines = []
            i += 1
            continue

        if is_title(i, line, stripped):
            # Fold continuation lines into a multi-line title.
            full_title = stripped
            j = i + 1
            while j < len(lines):
                candidate = lines[j]
                continuation = candidate.strip()
                if (not continuation
                        or candidate.startswith(" ")
                        or len(continuation) >= 60
                        or not title_continuation.match(continuation)):
                    break
                full_title += " " + continuation
                j += 1

            flush(current_title, current_body_lines)
            current_title = full_title
            current_body_lines = []
            # Resume after the last line consumed by the title.
            i = j
            continue

        if current_title:
            current_body_lines.append(line)
        i += 1

    flush(current_title, current_body_lines)
    return poems
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_wordsworth(text):
    """Extract poems from Lyrical Ballads by William Wordsworth (Gutenberg 9622).

    Parsing starts at the first line past line 50 that reads exactly
    "THE RIME OF THE ANCYENT MARINERE" (to get past the CONTENTS section);
    if no such line exists, parsing starts at the top of the body. Every
    short ALL-CAPS line that follows a blank line starts a new poem, except
    NOTE lines, "***" separators, "PART ..." sub-headings and bracketed
    remarks.

    Args:
        text: Raw Gutenberg text of the book.

    Returns:
        A list of the records returned by ``make_poem`` (falsy results are
        dropped).
    """
    opening_heading = "THE RIME OF THE ANCYENT MARINERE"
    # A "word" is a run of letters, optionally followed by an apostrophe
    # (straight or typographic) and more letters, e.g. FOSTER-MOTHER'S.
    word_re = re.compile(r"[^\W\d_]+(?:['\u2019][^\W\d_]+)?")

    def title_case(heading):
        # str.title() treats an apostrophe as a word boundary and would turn
        # "THE FOSTER-MOTHER'S TALE" into "The Foster-Mother'S Tale".
        return word_re.sub(lambda m: m.group(0).capitalize(), heading)

    def is_heading(index, stripped):
        return bool(
            stripped
            and len(stripped) < 80
            and stripped == stripped.upper()
            and re.search(r"[A-Z]{3,}", stripped)
            and not stripped.startswith(("[", "NOTE", "***", "PART "))
            and (index == 0 or not lines[index - 1].strip())
        )

    def flush(title, body_lines):
        # Emit the poem collected so far, if there is one.
        if not (title and body_lines):
            return
        poem = make_poem(
            title,
            "\n".join(body_lines),
            "William Wordsworth",
            "Lyrical Ballads",
            "1798",
        )
        if poem:
            poems.append(poem)

    lines = extract_body(text).split("\n")
    poems = []

    # Find where the poems start (after the CONTENTS section).
    start = 0
    for index, line in enumerate(lines):
        if index > 50 and line.strip() == opening_heading:
            start = index
            break

    current_title = ""
    current_body_lines = []
    for index in range(start, len(lines)):
        line = lines[index]
        stripped = line.strip()
        if is_heading(index, stripped):
            flush(current_title, current_body_lines)
            current_title = title_case(stripped)
            current_body_lines = []
        elif current_title:
            current_body_lines.append(line)

    flush(current_title, current_body_lines)
    return poems
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_shelley(text):
    """Extract poems from the Complete Poetical Works of Shelley (Gutenberg 4800).

    Every ALL-CAPS line of 4 to 69 characters that follows a blank line is a
    heading, unless it is a bracketed remark, a "***" separator, an
    ACT/SCENE/PART marker, a Roman numeral followed by a period, or starts
    with a number and a period. Headings are handled in three ways:

    * editorial section names (CONTENTS, NOTE, PREFACE, ...) and anything
      starting with "NOTES" are skipped without ending the current poem;
    * headings starting with an editorial prefix ("NOTE BY", "TO ",
      "EDITED", ...) end the current poem and discard what follows until
      the next heading;
    * any other heading ends the current poem and starts a new one.

    A collected poem is kept only if its text is longer than 200 characters
    and does not begin with PREFACE, DEDICATION, CANTO, PART, NOTE or an
    underscore followed by a digit.

    Args:
        text: Raw Gutenberg text of the book.

    Returns:
        A list of the records returned by ``make_poem`` (falsy results are
        dropped).
    """
    skip_titles = {
        "CONTENTS", "NOTE", "NOTES", "PREFACE", "INTRODUCTION",
        "APPENDIX", "DEDICATION", "ADVERTISEMENTS", "MEMOIR",
        "POSTSCRIPT", "DRAMATIS PERSONAE", "INDEX",
        "BIBLIOGRAPHY", "TABLE OF CONTENTS",
    }
    # Headings that introduce editorial / prose content.
    # NOTE(review): "TO " also matches every poem whose title begins with
    # "TO ..."; confirm that dropping those poems is intended.
    skip_prefixes = ("NOTE BY", "TO ", "INCLUDING", "EDITED", "THOMAS",
                     "MARY W", "LONDON", "POSTSCRIPT")

    non_title_heading = re.compile(r"^(?:(ACT|SCENE|PART)\s|[IVX]+\.$|\d+\.)")
    non_verse_opening = re.compile(r"^(PREFACE|DEDICATION|CANTO|PART|NOTE|_\d)")
    # A "word" is a run of letters, optionally followed by an apostrophe
    # (straight or typographic) and more letters, e.g. DEVIL'S.
    word_re = re.compile(r"[^\W\d_]+(?:['\u2019][^\W\d_]+)?")

    def title_case(heading):
        # str.title() treats an apostrophe as a word boundary and would turn
        # "THE DEVIL'S WALK" into "The Devil'S Walk".
        return word_re.sub(lambda m: m.group(0).capitalize(), heading)

    def is_heading(index, stripped):
        return bool(
            stripped
            and 3 < len(stripped) < 70
            and stripped == stripped.upper()
            and re.search(r"[A-Z]{3,}", stripped)
            and not stripped.startswith(("[", "***"))
            and not non_title_heading.match(stripped)
            and (index == 0 or not lines[index - 1].strip())
        )

    def flush(title, body_lines):
        # Emit the poem collected so far if it looks like real verse.
        if not (title and body_lines):
            return
        poem_text = "\n".join(body_lines)
        cleaned = poem_text.strip()
        # Skip TOC entries (mostly CANTO/CHAPTER lines), prefaces and notes.
        # The regex is anchored at the start, so it already covers the
        # separate PREFACE / CANTO prefix checks the original carried.
        if len(cleaned) <= 200 or non_verse_opening.match(cleaned):
            return
        poem = make_poem(
            title,
            poem_text,
            "Percy Bysshe Shelley",
            "Complete Poetical Works of Shelley",
            "1810–1822",
        )
        if poem:
            poems.append(poem)

    lines = extract_body(text).split("\n")
    poems = []
    current_title = ""
    current_body_lines = []

    for index, line in enumerate(lines):
        stripped = line.strip()

        if not is_heading(index, stripped):
            if current_title:
                current_body_lines.append(line)
            continue

        # NOTE(review): a skipped editorial heading does not end the current
        # poem, so the text under it is appended to that poem; confirm this
        # is intended.
        if (stripped.split(".")[0].strip() in skip_titles
                or stripped.startswith("NOTES")):
            continue

        flush(current_title, current_body_lines)
        current_body_lines = []
        # An editorial prefix closes the previous poem but starts nothing, so
        # the lines under it are discarded until the next heading.
        if stripped.startswith(skip_prefixes):
            current_title = ""
        else:
            current_title = title_case(stripped)

    flush(current_title, current_body_lines)
    return poems
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ─── Sources ─────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
# Catalogue of Project Gutenberg sources, in processing order. Each entry has:
#   id        - Gutenberg ebook number, passed to fetch_text()
#   filename  - name of the JSON file written into the poetry directory
#   title     - collection title shown in the console output
#   author    - author name shown in the console output
#   extractor - function called with the downloaded text; returns the poems
SOURCES = [
    dict(
        id=1041,
        filename="shakespeare_sonnets.json",
        title="Shakespeare's Sonnets",
        author="William Shakespeare",
        extractor=extract_shakespeare_sonnets,
    ),
    dict(
        id=12242,
        filename="dickinson_poems.json",
        title="Poems by Emily Dickinson",
        author="Emily Dickinson",
        extractor=extract_dickinson,
    ),
    dict(
        id=1322,
        filename="whitman_leaves_of_grass.json",
        title="Leaves of Grass",
        author="Walt Whitman",
        extractor=extract_whitman,
    ),
    dict(
        id=1934,
        filename="blake_songs.json",
        title="Songs of Innocence and of Experience",
        author="William Blake",
        extractor=extract_blake,
    ),
    dict(
        id=23684,
        filename="keats_poems_1820.json",
        title="Poems Published in 1820",
        author="John Keats",
        extractor=extract_keats,
    ),
    dict(
        id=10031,
        filename="poe_poetical_works.json",
        title="Complete Poetical Works",
        author="Edgar Allan Poe",
        extractor=extract_poe,
    ),
    dict(
        id=2002,
        filename="browning_sonnets_portuguese.json",
        title="Sonnets from the Portuguese",
        author="Elizabeth Barrett Browning",
        extractor=extract_browning_sonnets,
    ),
    dict(
        id=1321,
        filename="eliot_waste_land.json",
        title="The Waste Land",
        author="T.S. Eliot",
        extractor=extract_eliot_wasteland,
    ),
    dict(
        id=29345,
        filename="frost_mountain_interval.json",
        title="Mountain Interval",
        author="Robert Frost",
        extractor=extract_frost_mountain,
    ),
    dict(
        id=59824,
        filename="frost_selected_poems.json",
        title="Selected Poems",
        author="Robert Frost",
        extractor=extract_frost_selected,
    ),
    dict(
        id=32233,
        filename="yeats_wind_reeds.json",
        title="The Wind Among the Reeds",
        author="W.B. Yeats",
        extractor=extract_yeats,
    ),
    dict(
        id=246,
        filename="khayyam_rubaiyat.json",
        title="The Rubaiyat of Omar Khayyam",
        author="Omar Khayyam",
        extractor=extract_khayyam,
    ),
    dict(
        id=1279,
        filename="burns_poems_songs.json",
        title="Poems and Songs",
        author="Robert Burns",
        extractor=extract_burns,
    ),
    dict(
        id=9622,
        filename="wordsworth_lyrical_ballads.json",
        title="Lyrical Ballads",
        author="William Wordsworth",
        extractor=extract_wordsworth,
    ),
    dict(
        id=4800,
        filename="shelley_poetical_works.json",
        title="Complete Poetical Works",
        author="Percy Bysshe Shelley",
        extractor=extract_shelley,
    ),
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def download_source(source):
    """Fetch one catalogue entry and run its extractor over the text.

    Args:
        source: A ``SOURCES`` entry; its "id" is handed to ``fetch_text``
            and its "extractor" is called with the downloaded text.

    Returns:
        Whatever the extractor returns (the extracted poems). The number of
        poems is printed as a side effect.
    """
    raw_text = fetch_text(source["id"])
    extract = source["extractor"]
    extracted = extract(raw_text)
    print(f" Extracted {len(extracted)} poems")
    return extracted
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def save_poems(poems, filename):
    """Write *poems* as pretty-printed UTF-8 JSON into the poetry directory.

    Args:
        poems: JSON-serialisable collection of poems.
        filename: File name (not a full path) to create under ``POETRY_DIR``.
    """
    destination = os.path.join(POETRY_DIR, filename)
    with open(destination, "w", encoding="utf-8") as handle:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(poems, handle, indent=2, ensure_ascii=False)
    print(f" Saved to {destination}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Run the downloader from the command line.

    With ``--list`` among the arguments, print the catalogue of sources and
    stop. Otherwise download, parse and save every entry in ``SOURCES``. A
    source that raises is reported and skipped so the remaining sources are
    still processed.

    Returns:
        0 when no source raised (or when only listing), 1 when at least one
        source failed, so that callers and scripts can detect a partial run.
    """
    if "--list" in sys.argv:
        print("Available poetry sources:\n")
        for s in SOURCES:
            print(f" {s['author']:35s} {s['title']}")
            print(f" {'':35s} Gutenberg #{s['id']}")
            print()
        return 0

    os.makedirs(POETRY_DIR, exist_ok=True)

    rule = "=" * 60
    total = 0
    failed = []  # filenames of the sources whose download or parse raised
    for source in SOURCES:
        print(f"\n{rule}")
        print(f" {source['author']} — {source['title']}")
        print(f" Gutenberg #{source['id']}")
        print(rule)
        try:
            poems = download_source(source)
            if poems:
                save_poems(poems, source["filename"])
                total += len(poems)
            else:
                print(" WARNING: No poems extracted!")
        except Exception as e:
            # Best effort: one broken source must not stop the others, but
            # the failure is remembered for the exit status.
            print(f" ERROR: {e}")
            failed.append(source["filename"])

    print(f"\n{rule}")
    print(f" Total: {total} poems downloaded")
    if failed:
        print(f" Failed: {len(failed)} source(s): {', '.join(failed)}")
    print(rule)
    return 1 if failed else 0


if __name__ == "__main__":
    # Propagate main()'s status so a failed run is visible to the shell.
    sys.exit(main())
|