# Scraped repository-page chrome accidentally captured with the source
# (not part of the script); commented out so the file remains valid Python:
# You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
# 1295 lines
# 40 KiB
# Python

#!/usr/bin/env python3
"""Download and parse poetry collections from Project Gutenberg.
This is a maintainer-only tool. End users should use the pre-downloaded
poetry files in the poetry/ directory.
Usage:
python download_poetry.py # Download all sources
python download_poetry.py --list # List available sources
"""
import json
import os
import re
import sys
import urllib.request
POETRY_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "poetry")
GUTENBERG_URL = "https://www.gutenberg.org/cache/epub/{id}/pg{id}.txt"
def fetch_text(gutenberg_id):
    """Download the plain-text file for a Gutenberg ebook id.

    Returns the decoded text with all line endings normalized to "\n".
    """
    url = GUTENBERG_URL.format(id=gutenberg_id)
    print(f" Downloading {url} ...")
    request = urllib.request.Request(
        url, headers={"User-Agent": "PoetryDownloader/1.0"}
    )
    with urllib.request.urlopen(request, timeout=30) as response:
        # utf-8-sig strips a BOM if present.
        content = response.read().decode("utf-8-sig")
    # Collapse CRLF and lone CR into plain LF.
    return content.replace("\r\n", "\n").replace("\r", "\n")
def extract_body(text):
    """Return the portion of *text* between the Gutenberg START/END banners.

    Falls back to everything after START when END is missing, and to the
    whole text when START is missing.
    """
    start_marker = re.search(r"\*\*\*\s*START OF.*?\*\*\*", text)
    end_marker = re.search(r"\*\*\*\s*END OF.*?\*\*\*", text)
    if start_marker is None:
        return text
    begin = start_marker.end()
    if end_marker is not None:
        return text[begin:end_marker.start()]
    return text[begin:]
def clean_poem(text):
    """Normalize a poem body: strip trailing spaces and outer blank lines."""
    trimmed = [line.rstrip() for line in text.split("\n")]
    # Find the first and last non-blank lines and keep only that window.
    first = 0
    last = len(trimmed)
    while first < last and not trimmed[first].strip():
        first += 1
    while last > first and not trimmed[last - 1].strip():
        last -= 1
    return "\n".join(trimmed[first:last])
def make_poem(title, body, author, source, period):
    """Build a poem record dict, or return None for a negligible body.

    Bodies shorter than 20 characters after cleaning are treated as
    parsing artifacts and dropped.
    """
    cleaned = clean_poem(body)
    if len(cleaned.strip()) < 20:
        return None
    return {
        "title": title.strip(),
        "body": cleaned,
        "author": author,
        "source": source,
        "period": period,
    }
# ─── Extractors ─────────────────────────────────────────────────
def extract_shakespeare_sonnets(text):
    """Shakespeare's Sonnets (Gutenberg 1041).

    Each sonnet is introduced by a bare Roman numeral between blank lines.
    """
    content = extract_body(text)
    pieces = re.split(r"\n\n([IVXLC]+)\n\n", content)
    sonnets = []
    # pieces alternates: preamble, numeral, body, numeral, body, ...
    for numeral, sonnet_text in zip(pieces[1::2], pieces[2::2]):
        poem = make_poem(
            f"Sonnet {numeral.strip()}",
            sonnet_text,
            "William Shakespeare",
            "Shakespeare's Sonnets",
            "1609",
        )
        if poem is not None:
            sonnets.append(poem)
    return sonnets
def extract_dickinson(text):
    """Emily Dickinson's Poems, Three Series (Gutenberg 12242).

    Poems carry short ALL-CAPS titles directly after a blank line; they are
    grouped under section headings such as "I. LIFE." and separated by
    SECOND/THIRD SERIES dividers.
    """
    body = extract_body(text)
    poems = []
    lines = body.split("\n")
    current_title = ""
    current_body_lines = []
    in_poem = False

    def flush():
        # Emit the poem accumulated so far (if any) and reset the state.
        nonlocal current_title, current_body_lines
        if current_title and current_body_lines:
            p = make_poem(
                current_title,
                "\n".join(current_body_lines),
                "Emily Dickinson",
                "Poems by Emily Dickinson, Three Series",
                # NOTE: restored to "1890–1896"; the en dash had been lost
                # ("18901896") in this file.
                "1890–1896",
            )
            if p:
                poems.append(p)
        current_title = ""
        current_body_lines = []

    # Skip the transcriber's note etc.: start at the first section heading.
    start_idx = 0
    for i, line in enumerate(lines):
        if line.strip() == "I. LIFE.":
            start_idx = i
            break
    i = start_idx
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()
        # Section headers like "I. LIFE.", "II. LOVE.", etc.
        if re.match(r"^[IVX]+\.\s+[A-Z]+\.\s*$", stripped):
            flush()
            i += 1
            continue
        # Series dividers ("SECOND SERIES" / "THIRD SERIES").
        # BUG FIX: the original test was
        #     "SERIES" in s and "SECOND" in s or "THIRD" in s
        # which, because `and` binds tighter than `or`, matched ANY line
        # containing "THIRD" (e.g. a poem mentioning the word).
        if "SERIES" in stripped and ("SECOND" in stripped or "THIRD" in stripped):
            flush()
            i += 1
            continue
        # Poem title: short ALL-CAPS phrase directly after a blank line.
        if (stripped and
                len(stripped) < 60 and
                not stripped.startswith("[") and
                stripped == stripped.upper() and
                re.search(r"[A-Z]{2,}", stripped) and
                not re.match(r"^[IVX]+\.\s+[A-Z]+\.\s*$", stripped) and
                i > 0 and not lines[i - 1].strip()):
            flush()
            current_title = stripped.title()
            in_poem = True
            i += 1
            continue
        if in_poem:
            current_body_lines.append(line)
        i += 1
    # Don't forget the last poem.
    flush()
    return poems
def extract_whitman(text):
    """Walt Whitman's Leaves of Grass (Gutenberg 1322).

    A title is a short, un-indented line preceded by a blank line whose
    next non-blank line is indented (the poem body).
    """
    body = extract_body(text)
    poems = []
    # Poems have titles on their own lines, followed by indented poem text.
    # Some are prefixed with "BOOK I." etc.
    lines = body.split("\n")
    # Skip the initial inscription: start at the first real poem.
    start_idx = 0
    for i, line in enumerate(lines):
        if "One's-Self I Sing" in line:
            start_idx = i
            break
    current_title = ""       # title of the poem being accumulated
    current_body_lines = []  # its body lines so far
    i = start_idx
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()
        # Skip BOOK headers ("BOOK I", "BOOK XXXIV", ...).
        if re.match(r"^BOOK\s+[IVXLC]+", stripped):
            i += 1
            continue
        # Title candidate: non-blank, un-indented, short-ish, after a blank line.
        if (stripped and
            len(stripped) < 80 and
            not line.startswith(" ") and
            not stripped.startswith("[") and
            not stripped.startswith("BOOK") and
            i > 0 and not lines[i-1].strip()):
            # Confirm by checking that the next non-blank line is indented
            # (i.e. looks like a poem body, not more prose).
            j = i + 1
            while j < len(lines) and not lines[j].strip():
                j += 1
            if j < len(lines) and lines[j].startswith(" "):
                # It is a title — first save the previous poem, if any.
                if current_title and current_body_lines:
                    p = make_poem(
                        current_title,
                        "\n".join(current_body_lines),
                        "Walt Whitman",
                        "Leaves of Grass",
                        # NOTE(review): "18911892" looks like "1891–1892"
                        # with the en dash lost — confirm against the source.
                        "18911892",
                    )
                    if p:
                        poems.append(p)
                current_title = stripped
                current_body_lines = []
                i += 1
                continue
        if current_title:
            current_body_lines.append(line)
        i += 1
    # Flush the final poem.
    if current_title and current_body_lines:
        p = make_poem(
            current_title,
            "\n".join(current_body_lines),
            "Walt Whitman",
            "Leaves of Grass",
            "18911892",
        )
        if p:
            poems.append(p)
    return poems
def extract_blake(text):
    """William Blake's Songs of Innocence and of Experience (Gutenberg 1934).

    Titles are ALL-CAPS lines preceded by three newlines; the body of each
    poem runs up to the next title.
    """
    content = extract_body(text)
    # Drop illustration placeholders before scanning for titles.
    content = re.sub(r"\[Picture:.*?\]", "", content, flags=re.DOTALL)
    heading_re = re.compile(r"\n\n\n([A-Z][A-Z ,'!?:;\-—]+)\n")
    headings = list(heading_re.finditer(content))
    # Section headings that are not poems themselves.
    section_names = {"SONGS OF INNOCENCE", "SONGS OF EXPERIENCE", "CONTENTS",
                     "SONGS OF INNOCENCE AND OF EXPERIENCE"}
    poems = []
    # Each heading's poem ends where the next heading starts.
    boundaries = [m.start() for m in headings[1:]] + [len(content)]
    for match, end in zip(headings, boundaries):
        heading = match.group(1).strip()
        if heading in section_names:
            continue
        poem = make_poem(
            heading.title(),
            content[match.end():end],
            "William Blake",
            "Songs of Innocence and of Experience",
            "17891794",
        )
        if poem:
            poems.append(poem)
    return poems
def extract_keats(text):
    """Keats: Poems Published in 1820 (Gutenberg 23684).

    Uses a hand-maintained list of (start-marker, end-marker) pairs to
    slice the volume into individual poems, since the layout is irregular.
    The NOTES sections at the back are excluded by the last end marker.
    """
    body = extract_body(text)
    poems = []
    # Drop everything before the ADVERTISEMENT (contents, life of Keats...).
    advert_match = re.search(r"\nADVERTISEMENT\.?\n", body)
    if advert_match:
        body = body[advert_match.start():]
    # Each entry: (marker that starts the poem, marker that ends it).
    poem_titles = [
        ("ADVERTISEMENT", "LAMIA"),
        ("LAMIA. PART I", "LAMIA. PART II"),
        ("LAMIA. PART II", "ISABELLA"),
        ("ISABELLA; OR, THE POT OF BASIL", "THE EVE OF ST. AGNES"),
        ("THE EVE OF ST. AGNES", "ODE TO A NIGHTINGALE"),
        ("ODE TO A NIGHTINGALE", "ODE ON A GRECIAN URN"),
        ("ODE ON A GRECIAN URN", "ODE TO PSYCHE"),
        ("ODE TO PSYCHE", "FANCY"),
        ("FANCY", "ODE"),
        ("ODE\n", "LINES ON THE MERMAID TAVERN"),
        ("LINES ON THE MERMAID TAVERN", "ROBIN HOOD"),
        ("ROBIN HOOD", "TO AUTUMN"),
        ("TO AUTUMN", "ODE ON MELANCHOLY"),
        ("ODE ON MELANCHOLY", "HYPERION"),
        ("HYPERION. BOOK I", "HYPERION. BOOK II"),
        ("HYPERION. BOOK II", "HYPERION. BOOK III"),
        ("HYPERION. BOOK III", "NOTE ON ADVERTISEMENT"),
    ]
    for title_start, title_end in poem_titles:
        start = body.find(title_start)
        # NOTE(review): `end` is computed before the `start == -1` guard
        # below; when the start marker is missing, the search begins at a
        # bogus offset, but the result is then discarded by the `continue`,
        # so it is harmless.
        end = body.find(title_end, start + len(title_start)) if title_end else len(body)
        if start == -1:
            continue
        if end == -1:
            end = len(body)
        section = body[start:end]
        # Remove the title line itself from the extracted section.
        first_newline = section.find("\n")
        if first_newline != -1:
            poem_body = section[first_newline:]
        else:
            poem_body = section
        # Human-readable title, e.g. "LAMIA. PART I" -> "Lamia, Part I".
        display_title = title_start.replace(". PART ", ", Part ").title()
        if display_title == "Advertisement":
            continue
        # str.title() keeps the trailing newline, so this matches "ODE\n".
        if display_title.startswith("Ode\n"):
            display_title = "Ode (Bards of Passion and of Mirth)"
        # Remove editorial line numbers at line ends.
        poem_body = re.sub(r"\s+\d+$", "", poem_body, flags=re.MULTILINE)
        # Remove footnote markers like "[3]".
        poem_body = re.sub(r"\[\d+\]", "", poem_body)
        p = make_poem(
            display_title,
            poem_body,
            "John Keats",
            "Poems Published in 1820",
            "1820",
        )
        if p:
            poems.append(p)
    return poems
def extract_poe(text):
    """Complete Poetical Works of Edgar Allan Poe (Gutenberg 10031).

    Treats each short ALL-CAPS line that follows a blank line as a poem
    title; known front/back-matter headings flush the current poem and are
    skipped.
    """
    body = extract_body(text)
    poems = []
    lines = body.split("\n")
    current_title = ""
    current_body_lines = []
    # Headings that introduce non-poem sections.
    skip_sections = {"CONTENTS", "NOTE", "NOTES", "INDEX", "APPENDIX",
                     "MEMOIR", "PREFACE", "INTRODUCTION", "BIBLIOGRAPHY"}
    i = 0
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()
        # Title candidate: short ALL-CAPS line directly after a blank line.
        if (stripped and
            len(stripped) < 60 and
            stripped == stripped.upper() and
            re.search(r"[A-Z]{3,}", stripped) and
            not stripped.startswith("[") and
            not stripped.startswith("BY ") and
            not re.match(r"^\d+$", stripped) and
            not stripped.startswith("***") and
            i > 0 and not lines[i-1].strip()):
            # Section heading (e.g. "NOTES.") — flush the current poem
            # and reset without starting a new one.
            if stripped.split(".")[0].strip() in skip_sections:
                if current_title and current_body_lines:
                    p = make_poem(
                        current_title,
                        "\n".join(current_body_lines),
                        "Edgar Allan Poe",
                        "Complete Poetical Works of Edgar Allan Poe",
                        # NOTE(review): "18271849" looks like "1827–1849"
                        # with the en dash lost — confirm against the source.
                        "18271849",
                    )
                    if p:
                        poems.append(p)
                current_title = ""
                current_body_lines = []
                i += 1
                continue
            # New poem title — save the previous poem first.
            if current_title and current_body_lines:
                p = make_poem(
                    current_title,
                    "\n".join(current_body_lines),
                    "Edgar Allan Poe",
                    "Complete Poetical Works of Edgar Allan Poe",
                    "18271849",
                )
                if p:
                    poems.append(p)
            current_title = stripped.title()
            current_body_lines = []
            i += 1
            continue
        if current_title:
            current_body_lines.append(line)
        i += 1
    # Flush the final poem.
    if current_title and current_body_lines:
        p = make_poem(
            current_title,
            "\n".join(current_body_lines),
            "Edgar Allan Poe",
            "Complete Poetical Works of Edgar Allan Poe",
            "18271849",
        )
        if p:
            poems.append(p)
    return poems
def extract_browning_sonnets(text):
    """Sonnets from the Portuguese by Elizabeth Barrett Browning (Gutenberg 2002).

    Sonnets are separated by a Roman numeral flanked by blank lines.
    """
    content = extract_body(text)
    segments = re.split(r"\n\n\n+([IVXLC]+)\n\n\n", content)
    # segments alternates: preamble, numeral, body, numeral, body, ...
    collected = []
    for numeral, sonnet in zip(segments[1::2], segments[2::2]):
        poem = make_poem(
            f"Sonnet {numeral.strip()}",
            sonnet,
            "Elizabeth Barrett Browning",
            "Sonnets from the Portuguese",
            "1850",
        )
        if poem is not None:
            collected.append(poem)
    return collected
def extract_eliot_wasteland(text):
    """The Waste Land by T.S. Eliot (Gutenberg 1321).

    Locates the five numbered sections in the poem proper (skipping the
    CONTENTS listing) and emits each as a separate entry.
    """
    body = extract_body(text)
    poems = []
    section_defs = [
        ("I", "THE BURIAL OF THE DEAD"),
        ("II", "A GAME OF CHESS"),
        ("III", "THE FIRE SERMON"),
        ("IV", "DEATH BY WATER"),
        ("V", "WHAT THE THUNDER SAID"),
    ]
    # Collect (display title, heading start, heading end) for each section.
    positions = []
    for num, title in section_defs:
        # Matches an indented heading line such as "  I. THE BURIAL OF THE DEAD".
        pattern = re.compile(
            r"^\s+" + re.escape(num) + r"\.\s+" + re.escape(title) + r"\s*$",
            re.MULTILINE,
        )
        matches = list(pattern.finditer(body))
        # Use the second occurrence (the first is the CONTENTS entry).
        if len(matches) >= 2:
            positions.append((f"The Waste Land: {title.title()}", matches[1].start(), matches[1].end()))
        elif matches:
            positions.append((f"The Waste Land: {title.title()}", matches[0].start(), matches[0].end()))
    # End of the poem: the NOTES section that follows it (again prefer the
    # second occurrence when the phrase also appears earlier).
    notes_matches = list(re.finditer(r"NOTES ON", body))
    notes_pos = notes_matches[1].start() if len(notes_matches) >= 2 else (
        notes_matches[0].start() if notes_matches else len(body))
    for i, (title, sec_start, sec_text_start) in enumerate(positions):
        # Each section runs up to the next section's heading (or to NOTES).
        end = positions[i + 1][1] if i + 1 < len(positions) else notes_pos
        section = body[sec_text_start:end]
        # Remove editorial line numbers at line ends.
        section = re.sub(r"\s+\d+$", "", section, flags=re.MULTILINE)
        p = make_poem(
            title,
            section,
            "T.S. Eliot",
            "The Waste Land",
            "1922",
        )
        if p:
            poems.append(p)
    return poems
def extract_frost_mountain(text):
    """Robert Frost's Mountain Interval (Gutenberg 29345).

    Titles appear either as _ITALIC_ markup on their own line or as plain
    short ALL-CAPS lines after a blank line.
    """
    body = extract_body(text)
    poems = []
    lines = body.split("\n")
    # Find where poems start (first title after the CONTENTS listing).
    start_idx = 0
    for i, line in enumerate(lines):
        if "_THE ROAD NOT TAKEN_" in line or "THE ROAD NOT TAKEN" in line:
            start_idx = i
            break
    current_title = ""
    current_body_lines = []
    i = start_idx
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()
        # Italic title pattern: _TITLE_ on its own line.
        title_match = re.match(r"^_([A-Z][A-Z ,'!?:\-.—\"]+)_$", stripped)
        if not title_match:
            # Fallback: plain ALL-CAPS short line after a blank line.
            if (stripped and len(stripped) < 60 and
                stripped == stripped.upper() and
                re.search(r"[A-Z]{3,}", stripped) and
                not stripped.startswith("[") and
                i > 0 and not lines[i-1].strip()):
                # From here on title_match is reused as a plain truthy flag.
                title_match = True
                title_text = stripped.title()
            else:
                title_match = None
        else:
            title_text = title_match.group(1).title()
            title_match = True
        if title_match:
            # NOTE(review): sub_title is assigned but never used — the
            # subtitle handling below looks unfinished; confirm intent.
            sub_title = ""
            if i + 1 < len(lines) and lines[i + 1].strip():
                sub = lines[i + 1].strip()
                # Lines like "I LONELINESS--HER WORD" are numbered sub-parts.
                if re.match(r"^_?[IVX]+\s", sub) or re.match(r"^\d+\s", sub):
                    pass  # numbered sub-parts, don't treat as subtitle
            # Save the previous poem before starting a new one.
            if current_title and current_body_lines:
                p = make_poem(
                    current_title,
                    "\n".join(current_body_lines),
                    "Robert Frost",
                    "Mountain Interval",
                    "1916",
                )
                if p:
                    poems.append(p)
            current_title = title_text
            current_body_lines = []
            i += 1
            continue
        if current_title:
            current_body_lines.append(line)
        i += 1
    # Flush the final poem.
    if current_title and current_body_lines:
        p = make_poem(
            current_title,
            "\n".join(current_body_lines),
            "Robert Frost",
            "Mountain Interval",
            "1916",
        )
        if p:
            poems.append(p)
    return poems
def extract_frost_selected(text):
    """Robert Frost's Selected Poems (Gutenberg 59824)."""
    body = extract_body(text)
    poems = []
    lines = body.split("\n")
    # Find the first poem; `i > 50` skips the table-of-contents mention.
    start_idx = 0
    for i, line in enumerate(lines):
        if "THE PASTURE" in line and i > 50:
            start_idx = i
            break
    current_title = ""
    current_body_lines = []
    i = start_idx
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()
        # Section dividers: a bare Roman numeral after a blank line.
        if re.match(r"^[IVX]+$", stripped) and i > 0 and not lines[i-1].strip():
            i += 1
            continue
        # Title: short ALL-CAPS line after a blank line.
        if (stripped and len(stripped) < 70 and
            stripped == stripped.upper() and
            re.search(r"[A-Z]{3,}", stripped) and
            not stripped.startswith("[") and
            not re.match(r"^[IVX]+$", stripped) and
            i > 0 and not lines[i-1].strip()):
            # Save the previous poem before starting a new one.
            if current_title and current_body_lines:
                p = make_poem(
                    current_title,
                    "\n".join(current_body_lines),
                    "Robert Frost",
                    "Selected Poems",
                    # NOTE(review): "19131916" looks like "1913–1916" with
                    # the en dash lost — confirm against the source.
                    "19131916",
                )
                if p:
                    poems.append(p)
            current_title = stripped.title()
            current_body_lines = []
            i += 1
            continue
        if current_title:
            current_body_lines.append(line)
        i += 1
    # Flush the final poem.
    if current_title and current_body_lines:
        p = make_poem(
            current_title,
            "\n".join(current_body_lines),
            "Robert Frost",
            "Selected Poems",
            "19131916",
        )
        if p:
            poems.append(p)
    return poems
def extract_yeats(text):
    """W.B. Yeats' The Wind Among the Reeds (Gutenberg 32233).

    Poems have ALL-CAPS titles; the first title also appears in the
    CONTENTS, so scanning starts at its second occurrence when present.
    """
    body = extract_body(text)
    poems = []
    # Locate the first poem.
    idx = body.find("THE HOSTING OF THE SIDHE\n")
    if idx == -1:
        return poems
    # Prefer the second occurrence (the first is the CONTENTS entry).
    idx2 = body.find("THE HOSTING OF THE SIDHE\n", idx + 10)
    if idx2 != -1:
        body = body[idx2:]
    else:
        body = body[idx:]
    # Scan for ALL-CAPS title lines; everything until the next title
    # belongs to the current poem.
    lines = body.split("\n")
    current_title = ""
    current_body_lines = []
    for i, line in enumerate(lines):
        stripped = line.strip()
        # Title: ALL CAPS after a blank line (or the very first line).
        if (stripped and
            len(stripped) < 80 and
            stripped == stripped.upper() and
            re.search(r"[A-Z]{3,}", stripped) and
            not stripped.startswith("[") and
            not stripped.startswith("NOTE") and
            (i == 0 or not lines[i-1].strip())):
            # Save the previous poem before starting a new one.
            if current_title and current_body_lines:
                p = make_poem(
                    current_title,
                    "\n".join(current_body_lines),
                    "W.B. Yeats",
                    "The Wind Among the Reeds",
                    "1899",
                )
                if p:
                    poems.append(p)
            current_title = stripped.title()
            current_body_lines = []
            continue
        if current_title:
            current_body_lines.append(line)
    # Flush the final poem.
    if current_title and current_body_lines:
        p = make_poem(
            current_title,
            "\n".join(current_body_lines),
            "W.B. Yeats",
            "The Wind Among the Reeds",
            "1899",
        )
        if p:
            poems.append(p)
    return poems
def extract_khayyam(text):
    """The Rubaiyat of Omar Khayyam (Gutenberg 246).

    Extracts the quatrains of both the First and the Fifth FitzGerald
    editions contained in this ebook.
    """
    body = extract_body(text)
    poems = []
    for edition, label in [("First Edition", "First Edition"),
                           ("Fifth Edition", "Fifth Edition")]:
        # The phrase appears several times (contents, headings); pick the
        # occurrence that is soon followed by quatrain "I.".
        positions = [m.start() for m in re.finditer(re.escape(edition), body)]
        ed_start = None
        for pos in positions:
            chunk = body[pos:pos+200]
            if re.search(r"\n\n+I\.\n\n", chunk):
                ed_start = pos
                break
        if ed_start is None:
            continue
        ed_body = body[ed_start:]
        # Trim at the next edition heading or the Notes section; searching
        # from offset 200 skips past this edition's own heading.
        end_match = re.search(r"\n\n\n\n\n(First|Fifth) Edition", ed_body[200:])
        notes_match = re.search(r"\n\n\n\n\nNotes", ed_body[200:])
        if end_match:
            ed_body = ed_body[:end_match.start() + 200]
        elif notes_match:
            ed_body = ed_body[:notes_match.start() + 200]
        # Quatrains are numbered "I.", "II.", ... between blank lines;
        # re.split alternates text/numeral/quatrain/numeral/quatrain...
        parts = re.split(r"\n\n+([IVXLC]+)\.\n\n", ed_body)
        for i in range(1, len(parts) - 1, 2):
            numeral = parts[i].strip()
            quatrain = parts[i + 1]
            p = make_poem(
                f"Quatrain {numeral} ({label})",
                quatrain,
                "Omar Khayyam (trans. Edward FitzGerald)",
                "The Rubaiyat of Omar Khayyam",
                # NOTE(review): "11th12th century" looks like "11th–12th
                # century" with the en dash lost — confirm.
                "11th12th century",
            )
            if p:
                poems.append(p)
    return poems
def extract_burns(text):
    """Poems and Songs of Robert Burns (Gutenberg 1279).

    The collection is organized by year headings; titles are mixed-case
    lines such as "Song—Handsome Nell" or "To A Mouse" that follow a
    blank line.
    """
    body = extract_body(text)
    poems = []
    lines = body.split("\n")
    current_title = ""
    current_body_lines = []
    # Front/back-matter headings that must not become poem titles.
    skip_patterns = {"INTRODUCTORY NOTE", "GLOSSARY", "INDEX", "NOTES",
                     "APPENDIX", "CONTENTS", "PREFACE"}

    def flush():
        # Emit the poem accumulated so far (if any) and reset the state.
        nonlocal current_title, current_body_lines
        if current_title and current_body_lines:
            p = make_poem(
                current_title,
                "\n".join(current_body_lines),
                "Robert Burns",
                "Poems and Songs of Robert Burns",
                # NOTE: restored to "1771–1796"; the en dash had been lost
                # ("17711796") in this file.
                "1771–1796",
            )
            if p:
                poems.append(p)
        current_title = ""
        current_body_lines = []

    i = 0
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()
        # Year headings like "1771 - 1779" or "1780" delimit groups.
        if re.match(r"^\d{4}(\s*[-]\s*\d{4})?\s*$", stripped):
            flush()
            i += 1
            continue
        # Title candidate: short, un-indented, after a blank line, with at
        # least one capitalized word.
        if (stripped and
                len(stripped) < 80 and
                not line.startswith(" ") and  # not indented poem body
                i > 0 and not lines[i - 1].strip() and
                re.search(r"[A-Z][a-z]", stripped) and
                not stripped.startswith("[") and
                not stripped.startswith("Footnote")):
            is_title = False
            # Genre-prefixed titles: "Song—...", "Ballad—...", etc.
            if re.match(r"^(Song|Ballad|Epistle|Elegy|Epitaph|Ode|Address|Epigram|Extempore|Fragment|Prologue|Lament|Lines|Stanzas|Verses|Inscription)[\s—\-:]", stripped):
                is_title = True
            # Titles starting with common opening words.
            elif re.match(r"^(To |On |The |A |My |Tam |Holy |Poor |Bonnie |Highland )", stripped):
                is_title = True
            # ALL CAPS titles.
            elif stripped == stripped.upper() and len(stripped) > 5:
                is_title = True
            # BUG FIX: this test was `"" in stripped`, which is True for
            # every string, so every candidate was accepted and the
            # indentation check below was unreachable. The intended
            # character is the em dash used throughout Burns titles
            # (e.g. "Song—Handsome Nell") and in the genre regex above.
            elif "—" in stripped or stripped.endswith(":"):
                is_title = True
            # Otherwise accept only if the following text looks like a
            # poem body (indented or not ALL CAPS).
            elif i + 1 < len(lines):
                j = i + 1
                while j < len(lines) and not lines[j].strip():
                    j += 1
                if j < len(lines) and (lines[j].startswith(" ") or
                                       lines[j].strip() != lines[j].strip().upper()):
                    if not any(stripped.upper().startswith(s) for s in skip_patterns):
                        is_title = True
            if is_title:
                # Absorb continuation lines of multi-line titles
                # ("...\nOn Turning Her Up In Her Nest With The Plough").
                full_title = stripped
                j = i + 1
                while (j < len(lines) and lines[j].strip() and
                       not lines[j].startswith(" ") and
                       len(lines[j].strip()) < 60):
                    next_stripped = lines[j].strip()
                    if re.match(r"^(On |In |To |By |At |For |Or |And )", next_stripped):
                        full_title += " " + next_stripped
                        j += 1
                    else:
                        break
                flush()
                current_title = full_title
                i = j
                continue
        if current_title:
            current_body_lines.append(line)
        i += 1
    flush()
    return poems
def extract_wordsworth(text):
    """Lyrical Ballads by William Wordsworth (Gutenberg 9622)."""
    body = extract_body(text)
    poems = []
    lines = body.split("\n")
    # Find where poems start; `i > 50` skips the CONTENTS entry for the
    # same title.
    start_idx = 0
    for i, line in enumerate(lines):
        stripped = line.strip()
        if stripped == "THE RIME OF THE ANCYENT MARINERE" and i > 50:
            start_idx = i
            break
    current_title = ""
    current_body_lines = []
    i = start_idx
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()
        # Title: ALL-CAPS line after a blank line; "PART ..." headings stay
        # inside the current poem.
        if (stripped and
            len(stripped) < 80 and
            stripped == stripped.upper() and
            re.search(r"[A-Z]{3,}", stripped) and
            not stripped.startswith("[") and
            not stripped.startswith("NOTE") and
            not stripped.startswith("***") and
            not stripped.startswith("PART ") and
            (i == 0 or not lines[i-1].strip())):
            # Save the previous poem before starting a new one.
            if current_title and current_body_lines:
                p = make_poem(
                    current_title,
                    "\n".join(current_body_lines),
                    "William Wordsworth",
                    "Lyrical Ballads",
                    "1798",
                )
                if p:
                    poems.append(p)
            current_title = stripped.title()
            current_body_lines = []
            i += 1
            continue
        if current_title:
            current_body_lines.append(line)
        i += 1
    # Flush the final poem.
    if current_title and current_body_lines:
        p = make_poem(
            current_title,
            "\n".join(current_body_lines),
            "William Wordsworth",
            "Lyrical Ballads",
            "1798",
        )
        if p:
            poems.append(p)
    return poems
def extract_shelley(text):
    """Complete Poetical Works of Shelley (Gutenberg 4800).

    The volume mixes verse with a lot of editorial prose, so besides title
    detection the collector filters each candidate body: it must be longer
    than 200 characters and must not start like front matter or a TOC.
    """
    body = extract_body(text)
    poems = []
    lines = body.split("\n")
    current_title = ""
    current_body_lines = []
    # ALL-CAPS headings that are never poem titles.
    skip_titles = {"CONTENTS", "NOTE", "NOTES", "PREFACE", "INTRODUCTION",
                   "APPENDIX", "DEDICATION", "ADVERTISEMENTS", "MEMOIR",
                   "POSTSCRIPT", "DRAMATIS PERSONAE", "INDEX",
                   "BIBLIOGRAPHY", "TABLE OF CONTENTS"}
    # Prefixes of editorial/prose headings; a "title" starting with one of
    # these is discarded rather than collected.
    skip_prefixes = ("NOTE BY", "TO ", "INCLUDING", "EDITED", "THOMAS",
                     "MARY W", "LONDON", "POSTSCRIPT")
    i = 0
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()
        # Title: short ALL-CAPS line after a blank line, excluding
        # act/scene/part markers and numbered entries.
        if (stripped and
            3 < len(stripped) < 70 and
            stripped == stripped.upper() and
            re.search(r"[A-Z]{3,}", stripped) and
            not stripped.startswith("[") and
            not stripped.startswith("***") and
            not re.match(r"^(ACT|SCENE|PART)\s", stripped) and
            not re.match(r"^[IVX]+\.$", stripped) and
            not re.match(r"^\d+\.", stripped) and
            (i == 0 or not lines[i-1].strip())):
            title_word = stripped.split(".")[0].strip()
            if title_word in skip_titles:
                i += 1
                continue
            # Skip notes entries like "NOTES ON ...".
            if stripped.startswith("NOTES"):
                i += 1
                continue
            if current_title and current_body_lines:
                poem_text = "\n".join(current_body_lines)
                # Only keep poems with real verse content: skip TOC entries
                # (mostly CANTO/CHAPTER lines) and editorial notes.
                cleaned = poem_text.strip()
                if (len(cleaned) > 200 and
                    not cleaned.startswith("PREFACE") and
                    not cleaned.startswith("CANTO") and
                    not re.match(r"^(PREFACE|DEDICATION|CANTO|PART|NOTE|_\d)", cleaned)):
                    p = make_poem(
                        current_title,
                        poem_text,
                        "Percy Bysshe Shelley",
                        "Complete Poetical Works of Shelley",
                        # NOTE(review): "18101822" looks like "1810–1822"
                        # with the en dash lost — confirm against the source.
                        "18101822",
                    )
                    if p:
                        poems.append(p)
            current_title = stripped.title()
            # Discard editorial headings entirely.
            if any(stripped.startswith(sp) for sp in skip_prefixes):
                current_title = ""
                current_body_lines = []
                i += 1
                continue
            current_body_lines = []
            i += 1
            continue
        if current_title:
            current_body_lines.append(line)
        i += 1
    # Flush the final poem, applying the same content filter.
    if current_title and current_body_lines:
        poem_text = "\n".join(current_body_lines)
        cleaned = poem_text.strip()
        if (len(cleaned) > 200 and
            not cleaned.startswith("PREFACE") and
            not cleaned.startswith("CANTO") and
            not re.match(r"^(PREFACE|DEDICATION|CANTO|PART|NOTE|_\d)", cleaned)):
            p = make_poem(
                current_title,
                poem_text,
                "Percy Bysshe Shelley",
                "Complete Poetical Works of Shelley",
                "18101822",
            )
            if p:
                poems.append(p)
    return poems
# ─── Sources ─────────────────────────────────────────────────────
# Each entry: Gutenberg ebook id, output JSON filename, display metadata,
# and the extractor function that understands that edition's layout.
SOURCES = [
    {
        "id": 1041,
        "filename": "shakespeare_sonnets.json",
        "title": "Shakespeare's Sonnets",
        "author": "William Shakespeare",
        "extractor": extract_shakespeare_sonnets,
    },
    {
        "id": 12242,
        "filename": "dickinson_poems.json",
        "title": "Poems by Emily Dickinson",
        "author": "Emily Dickinson",
        "extractor": extract_dickinson,
    },
    {
        "id": 1322,
        "filename": "whitman_leaves_of_grass.json",
        "title": "Leaves of Grass",
        "author": "Walt Whitman",
        "extractor": extract_whitman,
    },
    {
        "id": 1934,
        "filename": "blake_songs.json",
        "title": "Songs of Innocence and of Experience",
        "author": "William Blake",
        "extractor": extract_blake,
    },
    {
        "id": 23684,
        "filename": "keats_poems_1820.json",
        "title": "Poems Published in 1820",
        "author": "John Keats",
        "extractor": extract_keats,
    },
    {
        "id": 10031,
        "filename": "poe_poetical_works.json",
        "title": "Complete Poetical Works",
        "author": "Edgar Allan Poe",
        "extractor": extract_poe,
    },
    {
        "id": 2002,
        "filename": "browning_sonnets_portuguese.json",
        "title": "Sonnets from the Portuguese",
        "author": "Elizabeth Barrett Browning",
        "extractor": extract_browning_sonnets,
    },
    {
        "id": 1321,
        "filename": "eliot_waste_land.json",
        "title": "The Waste Land",
        "author": "T.S. Eliot",
        "extractor": extract_eliot_wasteland,
    },
    {
        "id": 29345,
        "filename": "frost_mountain_interval.json",
        "title": "Mountain Interval",
        "author": "Robert Frost",
        "extractor": extract_frost_mountain,
    },
    {
        "id": 59824,
        "filename": "frost_selected_poems.json",
        "title": "Selected Poems",
        "author": "Robert Frost",
        "extractor": extract_frost_selected,
    },
    {
        "id": 32233,
        "filename": "yeats_wind_reeds.json",
        "title": "The Wind Among the Reeds",
        "author": "W.B. Yeats",
        "extractor": extract_yeats,
    },
    {
        "id": 246,
        "filename": "khayyam_rubaiyat.json",
        "title": "The Rubaiyat of Omar Khayyam",
        "author": "Omar Khayyam",
        "extractor": extract_khayyam,
    },
    {
        "id": 1279,
        "filename": "burns_poems_songs.json",
        "title": "Poems and Songs",
        "author": "Robert Burns",
        "extractor": extract_burns,
    },
    {
        "id": 9622,
        "filename": "wordsworth_lyrical_ballads.json",
        "title": "Lyrical Ballads",
        "author": "William Wordsworth",
        "extractor": extract_wordsworth,
    },
    {
        "id": 4800,
        "filename": "shelley_poetical_works.json",
        "title": "Complete Poetical Works",
        "author": "Percy Bysshe Shelley",
        "extractor": extract_shelley,
    },
]
def download_source(source):
    """Fetch one source's text and run its extractor; return the poem list."""
    raw_text = fetch_text(source["id"])
    extracted = source["extractor"](raw_text)
    print(f" Extracted {len(extracted)} poems")
    return extracted
def save_poems(poems, filename):
    """Write *poems* into POETRY_DIR/<filename> as pretty-printed UTF-8 JSON."""
    destination = os.path.join(POETRY_DIR, filename)
    with open(destination, "w", encoding="utf-8") as handle:
        json.dump(poems, handle, indent=2, ensure_ascii=False)
    print(f" Saved to {destination}")
def main():
    """CLI entry point: with --list show the sources, otherwise download all."""
    if "--list" in sys.argv:
        print("Available poetry sources:\n")
        for s in SOURCES:
            print(f" {s['author']:35s} {s['title']}")
            print(f" {'':35s} Gutenberg #{s['id']}")
            print()
        return
    os.makedirs(POETRY_DIR, exist_ok=True)
    total = 0
    for source in SOURCES:
        print(f"\n{'='*60}")
        # BUG FIX: author and title were printed with no separator between
        # them (an em dash appears to have been lost from the f-string).
        print(f" {source['author']} — {source['title']}")
        print(f" Gutenberg #{source['id']}")
        print(f"{'='*60}")
        try:
            poems = download_source(source)
            if poems:
                save_poems(poems, source["filename"])
                total += len(poems)
            else:
                print(" WARNING: No poems extracted!")
        except Exception as e:
            # Keep going: one failing source shouldn't abort the whole run.
            print(f" ERROR: {e}")
    print(f"\n{'='*60}")
    print(f" Total: {total} poems downloaded")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()