#!/usr/bin/env python3
"""Download and parse poetry collections from Project Gutenberg.

This is a maintainer-only tool. End users should use the pre-downloaded
poetry files in the poetry/ directory.

Usage:
    python download_poetry.py          # Download all sources
    python download_poetry.py --list   # List available sources
"""

import json
import os
import re
import sys
import urllib.request

# Output directory: a "poetry" folder next to this script.
POETRY_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "poetry")
# Plain-text download URL template; {id} is the Gutenberg ebook number.
GUTENBERG_URL = "https://www.gutenberg.org/cache/epub/{id}/pg{id}.txt"


def fetch_text(gutenberg_id):
    """Download one Gutenberg text and return it with normalized newlines.

    Args:
        gutenberg_id: Ebook number substituted into GUTENBERG_URL.

    Returns:
        The response decoded as UTF-8 (a leading BOM is dropped by the
        "utf-8-sig" codec), with "\\r\\n" and "\\r" converted to "\\n".

    Raises:
        Whatever urllib.request.urlopen raises on network/HTTP failure, and
        UnicodeDecodeError if the payload is not valid UTF-8.
    """
    url = GUTENBERG_URL.format(id=gutenberg_id)
    print(f" Downloading {url} ...")
    req = urllib.request.Request(url, headers={"User-Agent": "PoetryDownloader/1.0"})
    with urllib.request.urlopen(req, timeout=30) as resp:
        raw = resp.read().decode("utf-8-sig")
    # Normalize line endings
    return raw.replace("\r\n", "\n").replace("\r", "\n")


def extract_body(text):
    """Return the text between the Gutenberg "*** START OF" / "*** END OF" markers.

    If only the START marker is found, everything after it is returned; if
    no START marker is found, the text is returned unchanged.
    """
    start = re.search(r"\*\*\*\s*START OF.*?\*\*\*", text)
    end = re.search(r"\*\*\*\s*END OF.*?\*\*\*", text)
    if start and end:
        return text[start.end():end.start()]
    if start:
        return text[start.end():]
    return text


def clean_poem(text):
    """Strip trailing whitespace per line and drop leading/trailing blank lines.

    Leading indentation of non-blank lines is preserved.
    """
    # After rstrip() a whitespace-only line is the empty string, so a plain
    # truthiness test identifies blank lines.
    lines = [line.rstrip() for line in text.split("\n")]
    first = 0
    last = len(lines)
    while first < last and not lines[first]:
        first += 1
    while last > first and not lines[last - 1]:
        last -= 1
    return "\n".join(lines[first:last])


def make_poem(title, body, author, source, period):
    """Build a poem dict, or return None when the cleaned body is too short.

    Bodies with fewer than 20 non-padding characters are rejected; this
    filters out empty sections and stray headings picked up by the
    heuristic extractors.
    """
    body = clean_poem(body)
    if len(body.strip()) < 20:
        return None
    return {
        "title": title.strip(),
        "body": body,
        "author": author,
        "source": source,
        "period": period,
    }


def _flush_poem(poems, title, body_lines, author, source, period):
    """Append the accumulated poem to *poems* if it has a title and body lines.

    Nothing is appended when the title is empty, no lines were collected,
    or make_poem() rejects the body as too short.
    """
    if not (title and body_lines):
        return
    poem = make_poem(title, "\n".join(body_lines), author, source, period)
    if poem:
        poems.append(poem)


# ─── Extractors ─────────────────────────────────────────────────
#
# Each extractor takes the full downloaded text and returns a list of poem
# dicts (see make_poem). They are heuristics tuned to one specific ebook.


def extract_shakespeare_sonnets(text):
    """Shakespeare's Sonnets (Gutenberg 1041)."""
    body = extract_body(text)
    poems = []
    # Sonnets are separated by a Roman numeral on its own line, surrounded by
    # blank lines. With a capturing group, re.split yields
    # [intro, numeral, body, numeral, body, ...].
    parts = re.split(r"\n\n([IVXLC]+)\n\n", body)
    for i in range(1, len(parts) - 1, 2):
        poem = make_poem(
            f"Sonnet {parts[i].strip()}",
            parts[i + 1],
            "William Shakespeare",
            "Shakespeare's Sonnets",
            "1609",
        )
        if poem:
            poems.append(poem)
    return poems


def extract_dickinson(text):
    """Emily Dickinson's Poems, Three Series (Gutenberg 12242).

    Poems are grouped under section headings ("I. LIFE.", "II. LOVE.", ...)
    and each poem is introduced by a short ALL CAPS title line that follows
    a blank line. Parsing starts at the first "I. LIFE." heading (or at the
    top of the body if that heading is not found).
    """
    body = extract_body(text)
    poems = []
    meta = ("Emily Dickinson", "Poems by Emily Dickinson, Three Series", "1890–1896")
    section_re = re.compile(r"^[IVX]+\.\s+[A-Z]+\.\s*$")

    lines = body.split("\n")
    start_idx = 0
    for i, line in enumerate(lines):
        if line.strip() == "I. LIFE.":
            start_idx = i
            break

    current_title = ""
    current_body_lines = []
    for i in range(start_idx, len(lines)):
        line = lines[i]
        stripped = line.strip()

        # Section headers like "I. LIFE.", "II. LOVE.", etc. end the current poem.
        if section_re.match(stripped):
            _flush_poem(poems, current_title, current_body_lines, *meta)
            current_title = ""
            current_body_lines = []
            continue

        # Series dividers ("SECOND SERIES" / "THIRD SERIES"). The parentheses
        # matter: without them `and` binds tighter than `or`, and any line
        # merely containing "THIRD" would be treated as a divider.
        if "SERIES" in stripped and ("SECOND" in stripped or "THIRD" in stripped):
            _flush_poem(poems, current_title, current_body_lines, *meta)
            current_title = ""
            current_body_lines = []
            continue

        # Poem title: short ALL CAPS line that follows a blank line.
        if (stripped
                and len(stripped) < 60
                and not stripped.startswith("[")
                and stripped == stripped.upper()
                and re.search(r"[A-Z]{2,}", stripped)
                and not section_re.match(stripped)
                and i > 0
                and not lines[i - 1].strip()):
            _flush_poem(poems, current_title, current_body_lines, *meta)
            current_title = stripped.title()
            current_body_lines = []
            continue

        # Lines seen before any title (or after a divider) belong to no poem.
        if current_title:
            current_body_lines.append(line)

    # Don't forget the last poem.
    _flush_poem(poems, current_title, current_body_lines, *meta)
    return poems


def extract_whitman(text):
    """Walt Whitman's Leaves of Grass (Gutenberg 1322).

    A title is a short line that does not start with a space, follows a
    blank line, and whose next non-blank line starts with a space (the
    poem body). "BOOK ..." headers are skipped entirely.
    """
    body = extract_body(text)
    poems = []
    meta = ("Walt Whitman", "Leaves of Grass", "1891–1892")
    lines = body.split("\n")

    # Skip the front matter: start at the first line mentioning the opening poem.
    start_idx = 0
    for i, line in enumerate(lines):
        if "One's-Self I Sing" in line:
            start_idx = i
            break

    current_title = ""
    current_body_lines = []
    for i in range(start_idx, len(lines)):
        line = lines[i]
        stripped = line.strip()

        # Skip BOOK headers
        if re.match(r"^BOOK\s+[IVXLC]+", stripped):
            continue

        if (stripped
                and len(stripped) < 80
                and not line.startswith(" ")
                and not stripped.startswith("[")
                and not stripped.startswith("BOOK")
                and i > 0
                and not lines[i - 1].strip()):
            # Confirm the candidate: the next non-blank line must be indented.
            j = i + 1
            while j < len(lines) and not lines[j].strip():
                j += 1
            if j < len(lines) and lines[j].startswith(" "):
                _flush_poem(poems, current_title, current_body_lines, *meta)
                current_title = stripped
                current_body_lines = []
                continue

        if current_title:
            current_body_lines.append(line)

    _flush_poem(poems, current_title, current_body_lines, *meta)
    return poems


def extract_blake(text):
    """William Blake's Songs of Innocence and of Experience (Gutenberg 1934)."""
    body = extract_body(text)
    poems = []
    # Remove [Picture: ...] tags (they may span several lines).
    body = re.sub(r"\[Picture:.*?\]", "", body, flags=re.DOTALL)
    # Titles are ALL CAPS lines preceded by at least two blank lines. Each
    # poem body runs from the end of its title match to the start of the next.
    title_pattern = re.compile(r"\n\n\n([A-Z][A-Z ,'!?:;\-—]+)\n")
    matches = list(title_pattern.finditer(body))
    skip_titles = {
        "SONGS OF INNOCENCE",
        "SONGS OF EXPERIENCE",
        "CONTENTS",
        "SONGS OF INNOCENCE AND OF EXPERIENCE",
    }
    for idx, match in enumerate(matches):
        title = match.group(1).strip()
        if title in skip_titles:
            continue
        end = matches[idx + 1].start() if idx + 1 < len(matches) else len(body)
        poem = make_poem(
            title.title(),
            body[match.end():end],
            "William Blake",
            "Songs of Innocence and of Experience",
            "1789–1794",
        )
        if poem:
            poems.append(poem)
    return poems


def extract_keats(text):
    """Keats: Poems Published in 1820 (Gutenberg 23684).

    Poem boundaries are given by a hand-written table of
    (start marker, end marker) pairs; each poem is the text from the first
    occurrence of its start marker up to the next occurrence of its end
    marker (or the end of the body if the end marker is not found).
    """
    body = extract_body(text)
    poems = []

    # Drop everything before the ADVERTISEMENT heading (contents, biography).
    advert_match = re.search(r"\nADVERTISEMENT\.?\n", body)
    if advert_match:
        body = body[advert_match.start():]

    poem_titles = [
        ("ADVERTISEMENT", "LAMIA"),
        ("LAMIA. PART I", "LAMIA. PART II"),
        ("LAMIA. PART II", "ISABELLA"),
        ("ISABELLA; OR, THE POT OF BASIL", "THE EVE OF ST. AGNES"),
        ("THE EVE OF ST. AGNES", "ODE TO A NIGHTINGALE"),
        ("ODE TO A NIGHTINGALE", "ODE ON A GRECIAN URN"),
        ("ODE ON A GRECIAN URN", "ODE TO PSYCHE"),
        ("ODE TO PSYCHE", "FANCY"),
        ("FANCY", "ODE"),
        ("ODE\n", "LINES ON THE MERMAID TAVERN"),
        ("LINES ON THE MERMAID TAVERN", "ROBIN HOOD"),
        ("ROBIN HOOD", "TO AUTUMN"),
        ("TO AUTUMN", "ODE ON MELANCHOLY"),
        ("ODE ON MELANCHOLY", "HYPERION"),
        ("HYPERION. BOOK I", "HYPERION. BOOK II"),
        ("HYPERION. BOOK II", "HYPERION. BOOK III"),
        ("HYPERION. BOOK III", "NOTE ON ADVERTISEMENT"),
    ]

    for title_start, title_end in poem_titles:
        start = body.find(title_start)
        if start == -1:
            continue

        display_title = title_start.replace(". PART ", ", Part ").title()
        if display_title == "Advertisement":
            # The advertisement is prose, not a poem.
            continue
        if display_title.startswith("Ode\n"):
            # The bare "ODE" heading is disambiguated by its first line.
            display_title = "Ode (Bards of Passion and of Mirth)"

        end = body.find(title_end, start + len(title_start)) if title_end else len(body)
        if end == -1:
            end = len(body)
        section = body[start:end]

        # Remove the title line itself
        first_newline = section.find("\n")
        poem_body = section[first_newline:] if first_newline != -1 else section

        # Remove editorial line numbers at line ends, then footnote markers.
        poem_body = re.sub(r"\s+\d+$", "", poem_body, flags=re.MULTILINE)
        poem_body = re.sub(r"\[\d+\]", "", poem_body)

        poem = make_poem(
            display_title,
            poem_body,
            "John Keats",
            "Poems Published in 1820",
            "1820",
        )
        if poem:
            poems.append(poem)
    return poems


def extract_poe(text):
    """Complete Poetical Works of Edgar Allan Poe (Gutenberg 10031).

    Titles are short ALL CAPS lines following a blank line. Headings whose
    first dot-separated component is a known non-poem section (CONTENTS,
    NOTES, ...) end the current poem without starting a new one.
    """
    body = extract_body(text)
    poems = []
    meta = ("Edgar Allan Poe", "Complete Poetical Works of Edgar Allan Poe", "1827–1849")
    lines = body.split("\n")
    skip_sections = {
        "CONTENTS", "NOTE", "NOTES", "INDEX", "APPENDIX", "MEMOIR",
        "PREFACE", "INTRODUCTION", "BIBLIOGRAPHY",
    }

    current_title = ""
    current_body_lines = []
    for i, line in enumerate(lines):
        stripped = line.strip()

        if (stripped
                and len(stripped) < 60
                and stripped == stripped.upper()
                and re.search(r"[A-Z]{3,}", stripped)
                and not stripped.startswith("[")
                and not stripped.startswith("BY ")
                and not re.match(r"^\d+$", stripped)
                and not stripped.startswith("***")
                and i > 0
                and not lines[i - 1].strip()):
            _flush_poem(poems, current_title, current_body_lines, *meta)
            if stripped.split(".")[0].strip() in skip_sections:
                current_title = ""
            else:
                current_title = stripped.title()
            current_body_lines = []
            continue

        if current_title:
            current_body_lines.append(line)

    _flush_poem(poems, current_title, current_body_lines, *meta)
    return poems


def extract_browning_sonnets(text):
    """Sonnets from the Portuguese by Elizabeth Barrett Browning (Gutenberg 2002)."""
    body = extract_body(text)
    poems = []
    # Split on Roman numeral headers set off by at least two blank lines
    # before and exactly two after; yields [intro, numeral, body, ...].
    parts = re.split(r"\n\n\n+([IVXLC]+)\n\n\n", body)
    for i in range(1, len(parts) - 1, 2):
        poem = make_poem(
            f"Sonnet {parts[i].strip()}",
            parts[i + 1],
            "Elizabeth Barrett Browning",
            "Sonnets from the Portuguese",
            "1850",
        )
        if poem:
            poems.append(poem)
    return poems


def extract_eliot_wasteland(text):
    """The Waste Land by T.S. Eliot (Gutenberg 1321).

    Produces one entry per numbered section of the poem. Each section runs
    from the end of its heading to the start of the next heading; the last
    section ends at the "NOTES ON" marker.
    """
    body = extract_body(text)
    poems = []
    section_defs = [
        ("I", "THE BURIAL OF THE DEAD"),
        ("II", "A GAME OF CHESS"),
        ("III", "THE FIRE SERMON"),
        ("IV", "DEATH BY WATER"),
        ("V", "WHAT THE THUNDER SAID"),
    ]

    # Locate each heading "<numeral>. <TITLE>" preceded by whitespace on its
    # line. When a heading occurs more than once, the second occurrence is
    # used (the first is taken to be the CONTENTS entry).
    positions = []  # (display title, heading start, heading end)
    for num, title in section_defs:
        pattern = re.compile(
            r"^\s+" + re.escape(num) + r"\.\s+" + re.escape(title) + r"\s*$",
            re.MULTILINE,
        )
        matches = list(pattern.finditer(body))
        if matches:
            heading = matches[1] if len(matches) >= 2 else matches[0]
            positions.append(
                (f"The Waste Land: {title.title()}", heading.start(), heading.end())
            )

    # End of the poem proper: second "NOTES ON" if present, else the first,
    # else the end of the body.
    notes_matches = list(re.finditer(r"NOTES ON", body))
    if len(notes_matches) >= 2:
        notes_pos = notes_matches[1].start()
    elif notes_matches:
        notes_pos = notes_matches[0].start()
    else:
        notes_pos = len(body)

    for idx, (title, _heading_start, text_start) in enumerate(positions):
        end = positions[idx + 1][1] if idx + 1 < len(positions) else notes_pos
        # Remove the line numbers printed at the ends of lines.
        section = re.sub(r"\s+\d+$", "", body[text_start:end], flags=re.MULTILINE)
        poem = make_poem(title, section, "T.S. Eliot", "The Waste Land", "1922")
        if poem:
            poems.append(poem)
    return poems


def extract_frost_mountain(text):
    """Robert Frost's Mountain Interval (Gutenberg 29345).

    Titles are either an ALL CAPS line wrapped in underscores (_TITLE_) or a
    short ALL CAPS line following a blank line. Parsing starts at the first
    line containing "THE ROAD NOT TAKEN".
    """
    body = extract_body(text)
    poems = []
    meta = ("Robert Frost", "Mountain Interval", "1916")
    lines = body.split("\n")

    start_idx = 0
    for i, line in enumerate(lines):
        # Also matches the underscore-wrapped form "_THE ROAD NOT TAKEN_".
        if "THE ROAD NOT TAKEN" in line:
            start_idx = i
            break

    current_title = ""
    current_body_lines = []
    for i in range(start_idx, len(lines)):
        line = lines[i]
        stripped = line.strip()

        italic = re.match(r"^_([A-Z][A-Z ,'!?:\-.—\"]+)_$", stripped)
        if italic:
            title_text = italic.group(1).title()
        elif (stripped
                and len(stripped) < 60
                and stripped == stripped.upper()
                and re.search(r"[A-Z]{3,}", stripped)
                and not stripped.startswith("[")
                and i > 0
                and not lines[i - 1].strip()):
            # Non-italic ALL CAPS title
            title_text = stripped.title()
        else:
            title_text = None

        if title_text is not None:
            _flush_poem(poems, current_title, current_body_lines, *meta)
            current_title = title_text
            current_body_lines = []
            continue

        if current_title:
            current_body_lines.append(line)

    _flush_poem(poems, current_title, current_body_lines, *meta)
    return poems


def extract_frost_selected(text):
    """Robert Frost's Selected Poems (Gutenberg 59824).

    Parsing starts at the first line containing "THE PASTURE" after line
    index 50 (earlier occurrences are skipped as front matter).
    """
    body = extract_body(text)
    poems = []
    meta = ("Robert Frost", "Selected Poems", "1913–1916")
    lines = body.split("\n")

    start_idx = 0
    for i, line in enumerate(lines):
        if "THE PASTURE" in line and i > 50:
            start_idx = i
            break

    current_title = ""
    current_body_lines = []
    for i in range(start_idx, len(lines)):
        line = lines[i]
        stripped = line.strip()
        after_blank = i > 0 and not lines[i - 1].strip()
        is_numeral = re.match(r"^[IVX]+$", stripped)

        # Section dividers: a bare Roman numeral after a blank line is
        # dropped and does not become part of any poem.
        if is_numeral and after_blank:
            continue

        # Title: ALL CAPS, short, after blank line
        if (stripped
                and len(stripped) < 70
                and stripped == stripped.upper()
                and re.search(r"[A-Z]{3,}", stripped)
                and not stripped.startswith("[")
                and not is_numeral
                and after_blank):
            _flush_poem(poems, current_title, current_body_lines, *meta)
            current_title = stripped.title()
            current_body_lines = []
            continue

        if current_title:
            current_body_lines.append(line)

    _flush_poem(poems, current_title, current_body_lines, *meta)
    return poems


def extract_yeats(text):
    """W.B. Yeats' The Wind Among the Reeds (Gutenberg 32233).

    Returns an empty list if the first poem's title cannot be found.
    """
    body = extract_body(text)
    poems = []
    meta = ("W.B. Yeats", "The Wind Among the Reeds", "1899")

    # Start at the second occurrence of the first poem's title when there is
    # one (the first occurrence is taken to be the CONTENTS entry).
    first_title = "THE HOSTING OF THE SIDHE\n"
    idx = body.find(first_title)
    if idx == -1:
        return poems
    idx2 = body.find(first_title, idx + 10)
    body = body[idx2:] if idx2 != -1 else body[idx:]

    lines = body.split("\n")
    current_title = ""
    current_body_lines = []
    for i, line in enumerate(lines):
        stripped = line.strip()
        # Title: ALL CAPS, at the very start or after a blank line
        if (stripped
                and len(stripped) < 80
                and stripped == stripped.upper()
                and re.search(r"[A-Z]{3,}", stripped)
                and not stripped.startswith("[")
                and not stripped.startswith("NOTE")
                and (i == 0 or not lines[i - 1].strip())):
            _flush_poem(poems, current_title, current_body_lines, *meta)
            current_title = stripped.title()
            current_body_lines = []
            continue
        if current_title:
            current_body_lines.append(line)

    _flush_poem(poems, current_title, current_body_lines, *meta)
    return poems


def extract_khayyam(text):
    """The Rubaiyat of Omar Khayyam (Gutenberg 246).

    Extracts the numbered quatrains of both the First and the Fifth
    edition; the edition name is included in each title.
    """
    body = extract_body(text)
    poems = []

    for edition in ("First Edition", "Fifth Edition"):
        # The edition name occurs several times; use the first occurrence
        # that is followed within 200 characters by quatrain "I.".
        ed_start = None
        for match in re.finditer(re.escape(edition), body):
            chunk = body[match.start():match.start() + 200]
            if re.search(r"\n\n+I\.\n\n", chunk):
                ed_start = match.start()
                break
        if ed_start is None:
            continue
        ed_body = body[ed_start:]

        # Cut at the next edition heading, else at the Notes heading. The
        # search skips the first 200 characters so the current heading is
        # not matched; +200 converts back to ed_body coordinates.
        tail = ed_body[200:]
        end_match = re.search(r"\n\n\n\n\n(First|Fifth) Edition", tail)
        notes_match = re.search(r"\n\n\n\n\nNotes", tail)
        if end_match:
            ed_body = ed_body[:end_match.start() + 200]
        elif notes_match:
            ed_body = ed_body[:notes_match.start() + 200]

        # Quatrains are numbered "I.", "II.", ... on their own lines.
        parts = re.split(r"\n\n+([IVXLC]+)\.\n\n", ed_body)
        for i in range(1, len(parts) - 1, 2):
            poem = make_poem(
                f"Quatrain {parts[i].strip()} ({edition})",
                parts[i + 1],
                "Omar Khayyam (trans. Edward FitzGerald)",
                "The Rubaiyat of Omar Khayyam",
                "11th–12th century",
            )
            if poem:
                poems.append(poem)
    return poems


def extract_burns(text):
    """Poems and Songs of Robert Burns (Gutenberg 1279).

    The collection is organized by year. Titles are mixed-case lines such
    as "Song—Handsome Nell" or "To A Mouse" that follow a blank line and do
    not start with a space. Year headings end the current poem.
    """
    body = extract_body(text)
    poems = []
    meta = ("Robert Burns", "Poems and Songs of Robert Burns", "1771–1796")
    lines = body.split("\n")
    skip_patterns = {
        "INTRODUCTORY NOTE", "GLOSSARY", "INDEX", "NOTES", "APPENDIX",
        "CONTENTS", "PREFACE",
    }

    current_title = ""
    current_body_lines = []
    i = 0
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()

        # Year headings like "1771 - 1779" or "1780"
        if re.match(r"^\d{4}(\s*[-–]\s*\d{4})?\s*$", stripped):
            _flush_poem(poems, current_title, current_body_lines, *meta)
            current_title = ""
            current_body_lines = []
            i += 1
            continue

        # Title candidate: short, not indented, after a blank line, and
        # containing at least one capitalized word.
        if (stripped
                and len(stripped) < 80
                and not line.startswith(" ")
                and i > 0
                and not lines[i - 1].strip()
                and re.search(r"[A-Z][a-z]", stripped)
                and not stripped.startswith("[")
                and not stripped.startswith("Footnote")):
            is_title = False
            # Genre-prefixed titles: "Song—", "Ballad—", etc.
            if re.match(r"^(Song|Ballad|Epistle|Elegy|Epitaph|Ode|Address|Epigram|Extempore|Fragment|Prologue|Lament|Lines|Stanzas|Verses|Inscription)[\s—\-:]", stripped):
                is_title = True
            # Titles starting with a common leading word
            elif re.match(r"^(To |On |The |A |My |Tam |Holy |Poor |Bonnie |Highland )", stripped):
                is_title = True
            # Titles with special chars
            elif "—" in stripped or stripped.endswith(":"):
                is_title = True
            # Otherwise accept if the next non-blank line looks like verse
            # (indented, or not ALL CAPS) and this is not a known
            # non-poem heading.
            elif i + 1 < len(lines):
                j = i + 1
                while j < len(lines) and not lines[j].strip():
                    j += 1
                if j < len(lines) and (
                        lines[j].startswith(" ")
                        or lines[j].strip() != lines[j].strip().upper()):
                    if not any(stripped.upper().startswith(s) for s in skip_patterns):
                        is_title = True

            if is_title:
                # Absorb continuation lines of a multi-line title: short,
                # unindented lines that start with a connective word.
                full_title = stripped
                j = i + 1
                while (j < len(lines)
                        and lines[j].strip()
                        and not lines[j].startswith(" ")
                        and len(lines[j].strip()) < 60):
                    next_stripped = lines[j].strip()
                    if not re.match(r"^(On |In |To |By |At |For |Or |And )", next_stripped):
                        break
                    full_title += " " + next_stripped
                    j += 1

                _flush_poem(poems, current_title, current_body_lines, *meta)
                current_title = full_title
                current_body_lines = []
                i = j  # resume after the (possibly multi-line) title
                continue

        if current_title:
            current_body_lines.append(line)
        i += 1

    _flush_poem(poems, current_title, current_body_lines, *meta)
    return poems


def extract_wordsworth(text):
    """Lyrical Ballads by William Wordsworth (Gutenberg 9622).

    Parsing starts at the line that is exactly
    "THE RIME OF THE ANCYENT MARINERE" after line index 50 (earlier
    occurrences are skipped as front matter). "PART ..." headings are kept
    inside the poem rather than treated as titles.
    """
    body = extract_body(text)
    poems = []
    meta = ("William Wordsworth", "Lyrical Ballads", "1798")
    lines = body.split("\n")

    start_idx = 0
    for i, line in enumerate(lines):
        if line.strip() == "THE RIME OF THE ANCYENT MARINERE" and i > 50:
            start_idx = i
            break

    current_title = ""
    current_body_lines = []
    for i in range(start_idx, len(lines)):
        line = lines[i]
        stripped = line.strip()
        # Title: ALL CAPS, after blank line
        if (stripped
                and len(stripped) < 80
                and stripped == stripped.upper()
                and re.search(r"[A-Z]{3,}", stripped)
                and not stripped.startswith("[")
                and not stripped.startswith("NOTE")
                and not stripped.startswith("***")
                and not stripped.startswith("PART ")
                and (i == 0 or not lines[i - 1].strip())):
            _flush_poem(poems, current_title, current_body_lines, *meta)
            current_title = stripped.title()
            current_body_lines = []
            continue
        if current_title:
            current_body_lines.append(line)

    _flush_poem(poems, current_title, current_body_lines, *meta)
    return poems


def _flush_shelley_poem(poems, title, body_lines):
    """Append a Shelley poem if it looks like real verse.

    Besides the usual title/body checks, the stripped text must be longer
    than 200 characters and must not begin with a prose/TOC marker
    (PREFACE, DEDICATION, CANTO, PART, NOTE, or an underscore + digit).
    """
    if not (title and body_lines):
        return
    poem_text = "\n".join(body_lines)
    cleaned = poem_text.strip()
    if len(cleaned) <= 200:
        return
    if re.match(r"^(PREFACE|DEDICATION|CANTO|PART|NOTE|_\d)", cleaned):
        return
    poem = make_poem(
        title,
        poem_text,
        "Percy Bysshe Shelley",
        "Complete Poetical Works of Shelley",
        "1810–1822",
    )
    if poem:
        poems.append(poem)


def extract_shelley(text):
    """Complete Poetical Works of Shelley (Gutenberg 4800).

    Titles are short ALL CAPS lines following a blank line, excluding
    ACT/SCENE/PART headings and numbered items. Known non-poem headings are
    ignored outright; headings with an editorial prefix end the current
    poem without starting a new one.
    """
    body = extract_body(text)
    poems = []
    lines = body.split("\n")
    skip_titles = {
        "CONTENTS", "NOTE", "NOTES", "PREFACE", "INTRODUCTION", "APPENDIX",
        "DEDICATION", "ADVERTISEMENTS", "MEMOIR", "POSTSCRIPT",
        "DRAMATIS PERSONAE", "INDEX", "BIBLIOGRAPHY", "TABLE OF CONTENTS",
    }
    # Editorial / prose headings.
    # NOTE(review): "TO " also matches poem titles that begin with "TO "
    # — confirm that dropping those is intended.
    skip_prefixes = ("NOTE BY", "TO ", "INCLUDING", "EDITED", "THOMAS",
                     "MARY W", "LONDON", "POSTSCRIPT")

    current_title = ""
    current_body_lines = []
    for i, line in enumerate(lines):
        stripped = line.strip()

        # Title: ALL CAPS, short, after blank line
        if (stripped
                and 3 < len(stripped) < 70
                and stripped == stripped.upper()
                and re.search(r"[A-Z]{3,}", stripped)
                and not stripped.startswith("[")
                and not stripped.startswith("***")
                and not re.match(r"^(ACT|SCENE|PART)\s", stripped)
                and not re.match(r"^[IVX]+\.$", stripped)
                and not re.match(r"^\d+\.", stripped)
                and (i == 0 or not lines[i - 1].strip())):
            # Known non-poem headings are dropped without ending the
            # current poem; the heading line itself is not kept.
            if stripped.split(".")[0].strip() in skip_titles:
                continue
            if stripped.startswith("NOTES"):
                continue

            _flush_shelley_poem(poems, current_title, current_body_lines)
            if stripped.startswith(skip_prefixes):
                current_title = ""
            else:
                current_title = stripped.title()
            current_body_lines = []
            continue

        if current_title:
            current_body_lines.append(line)

    _flush_shelley_poem(poems, current_title, current_body_lines)
    return poems


# ─── Sources ─────────────────────────────────────────────────────

# One entry per Gutenberg ebook: its id, the output JSON filename, display
# metadata for --list, and the extractor that parses it.
SOURCES = [
    {
        "id": 1041,
        "filename": "shakespeare_sonnets.json",
        "title": "Shakespeare's Sonnets",
        "author": "William Shakespeare",
        "extractor": extract_shakespeare_sonnets,
    },
    {
        "id": 12242,
        "filename": "dickinson_poems.json",
        "title": "Poems by Emily Dickinson",
        "author": "Emily Dickinson",
        "extractor": extract_dickinson,
    },
    {
        "id": 1322,
        "filename": "whitman_leaves_of_grass.json",
        "title": "Leaves of Grass",
        "author": "Walt Whitman",
        "extractor": extract_whitman,
    },
    {
        "id": 1934,
        "filename": "blake_songs.json",
        "title": "Songs of Innocence and of Experience",
        "author": "William Blake",
        "extractor": extract_blake,
    },
    {
        "id": 23684,
        "filename": "keats_poems_1820.json",
        "title": "Poems Published in 1820",
        "author": "John Keats",
        "extractor": extract_keats,
    },
    {
        "id": 10031,
        "filename": "poe_poetical_works.json",
        "title": "Complete Poetical Works",
        "author": "Edgar Allan Poe",
        "extractor": extract_poe,
    },
    {
        "id": 2002,
        "filename": "browning_sonnets_portuguese.json",
        "title": "Sonnets from the Portuguese",
        "author": "Elizabeth Barrett Browning",
        "extractor": extract_browning_sonnets,
    },
    {
        "id": 1321,
        "filename": "eliot_waste_land.json",
        "title": "The Waste Land",
        "author": "T.S. Eliot",
        "extractor": extract_eliot_wasteland,
    },
    {
        "id": 29345,
        "filename": "frost_mountain_interval.json",
        "title": "Mountain Interval",
        "author": "Robert Frost",
        "extractor": extract_frost_mountain,
    },
    {
        "id": 59824,
        "filename": "frost_selected_poems.json",
        "title": "Selected Poems",
        "author": "Robert Frost",
        "extractor": extract_frost_selected,
    },
    {
        "id": 32233,
        "filename": "yeats_wind_reeds.json",
        "title": "The Wind Among the Reeds",
        "author": "W.B. Yeats",
        "extractor": extract_yeats,
    },
    {
        "id": 246,
        "filename": "khayyam_rubaiyat.json",
        "title": "The Rubaiyat of Omar Khayyam",
        "author": "Omar Khayyam",
        "extractor": extract_khayyam,
    },
    {
        "id": 1279,
        "filename": "burns_poems_songs.json",
        "title": "Poems and Songs",
        "author": "Robert Burns",
        "extractor": extract_burns,
    },
    {
        "id": 9622,
        "filename": "wordsworth_lyrical_ballads.json",
        "title": "Lyrical Ballads",
        "author": "William Wordsworth",
        "extractor": extract_wordsworth,
    },
    {
        "id": 4800,
        "filename": "shelley_poetical_works.json",
        "title": "Complete Poetical Works",
        "author": "Percy Bysshe Shelley",
        "extractor": extract_shelley,
    },
]


def download_source(source):
    """Download one SOURCES entry and return the poems its extractor finds."""
    text = fetch_text(source["id"])
    poems = source["extractor"](text)
    print(f" Extracted {len(poems)} poems")
    return poems


def save_poems(poems, filename):
    """Write *poems* as indented UTF-8 JSON to POETRY_DIR/filename.

    The directory must already exist (main() creates it).
    """
    filepath = os.path.join(POETRY_DIR, filename)
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(poems, f, indent=2, ensure_ascii=False)
    print(f" Saved to {filepath}")


def main():
    """Command-line entry point: list the sources or download them all."""
    if "--list" in sys.argv:
        print("Available poetry sources:\n")
        for s in SOURCES:
            print(f" {s['author']:35s} {s['title']}")
            print(f" {'':35s} Gutenberg #{s['id']}")
            print()
        return

    os.makedirs(POETRY_DIR, exist_ok=True)
    total = 0
    for source in SOURCES:
        print(f"\n{'='*60}")
        print(f" {source['author']} — {source['title']}")
        print(f" Gutenberg #{source['id']}")
        print(f"{'='*60}")
        # Best-effort batch: a failure in one source (network error, parse
        # error) is reported and the remaining sources are still processed.
        try:
            poems = download_source(source)
            if poems:
                save_poems(poems, source["filename"])
                total += len(poems)
            else:
                print(" WARNING: No poems extracted!")
        except Exception as e:
            print(f" ERROR: {e}")

    print(f"\n{'='*60}")
    print(f" Total: {total} poems downloaded")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()