# Scraped repository-page chrome accidentally captured with the source
# (not part of the script); commented out so the file remains valid Python:
# You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
# 1295 lines
# 40 KiB
# Python

#!/usr/bin/env python3
"""Download and parse poetry collections from Project Gutenberg.
This is a maintainer-only tool. End users should use the pre-downloaded
poetry files in the poetry/ directory.
Usage:
python download_poetry.py # Download all sources
python download_poetry.py --list # List available sources
"""
import json
import os
import re
import sys
import urllib.request
POETRY_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "poetry")
GUTENBERG_URL = "https://www.gutenberg.org/cache/epub/{id}/pg{id}.txt"
def fetch_text(gutenberg_id):
    """Download the plain-text file for a Gutenberg ebook id.

    Returns the decoded text with all line endings normalized to "\n".
    """
    url = GUTENBERG_URL.format(id=gutenberg_id)
    print(f" Downloading {url} ...")
    request = urllib.request.Request(
        url, headers={"User-Agent": "PoetryDownloader/1.0"}
    )
    with urllib.request.urlopen(request, timeout=30) as response:
        # utf-8-sig strips a BOM if present.
        content = response.read().decode("utf-8-sig")
    # Collapse CRLF and lone CR into plain LF.
    return content.replace("\r\n", "\n").replace("\r", "\n")
def extract_body(text):
    """Return the portion of *text* between the Gutenberg START/END banners.

    Falls back to everything after START when END is missing, and to the
    whole text when START is missing.
    """
    start_marker = re.search(r"\*\*\*\s*START OF.*?\*\*\*", text)
    end_marker = re.search(r"\*\*\*\s*END OF.*?\*\*\*", text)
    if start_marker is None:
        return text
    begin = start_marker.end()
    if end_marker is not None:
        return text[begin:end_marker.start()]
    return text[begin:]
def clean_poem(text):
    """Normalize a poem body: strip trailing spaces and outer blank lines."""
    trimmed = [line.rstrip() for line in text.split("\n")]
    # Find the first and last non-blank lines and keep only that window.
    first = 0
    last = len(trimmed)
    while first < last and not trimmed[first].strip():
        first += 1
    while last > first and not trimmed[last - 1].strip():
        last -= 1
    return "\n".join(trimmed[first:last])
def make_poem(title, body, author, source, period):
    """Build a poem record dict, or return None for a negligible body.

    Bodies shorter than 20 characters after cleaning are treated as
    parsing artifacts and dropped.
    """
    cleaned = clean_poem(body)
    if len(cleaned.strip()) < 20:
        return None
    return {
        "title": title.strip(),
        "body": cleaned,
        "author": author,
        "source": source,
        "period": period,
    }
# ─── Extractors ─────────────────────────────────────────────────
def extract_shakespeare_sonnets(text):
    """Shakespeare's Sonnets (Gutenberg 1041).

    Each sonnet is introduced by a bare Roman numeral between blank lines.
    """
    content = extract_body(text)
    pieces = re.split(r"\n\n([IVXLC]+)\n\n", content)
    sonnets = []
    # pieces alternates: preamble, numeral, body, numeral, body, ...
    for numeral, sonnet_text in zip(pieces[1::2], pieces[2::2]):
        poem = make_poem(
            f"Sonnet {numeral.strip()}",
            sonnet_text,
            "William Shakespeare",
            "Shakespeare's Sonnets",
            "1609",
        )
        if poem is not None:
            sonnets.append(poem)
    return sonnets
def extract_dickinson(text):
    """Emily Dickinson's Poems, Three Series (Gutenberg 12242).

    Poems carry short ALL-CAPS titles directly after a blank line; they are
    grouped under section headings such as "I. LIFE." and separated by
    SECOND/THIRD SERIES dividers.
    """
    body = extract_body(text)
    poems = []
    lines = body.split("\n")
    current_title = ""
    current_body_lines = []
    in_poem = False

    def flush():
        # Emit the poem accumulated so far (if any) and reset the state.
        nonlocal current_title, current_body_lines
        if current_title and current_body_lines:
            p = make_poem(
                current_title,
                "\n".join(current_body_lines),
                "Emily Dickinson",
                "Poems by Emily Dickinson, Three Series",
                # NOTE: restored to "1890–1896"; the en dash had been lost
                # ("18901896") in this file.
                "1890–1896",
            )
            if p:
                poems.append(p)
        current_title = ""
        current_body_lines = []

    # Skip the transcriber's note etc.: start at the first section heading.
    start_idx = 0
    for i, line in enumerate(lines):
        if line.strip() == "I. LIFE.":
            start_idx = i
            break
    i = start_idx
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()
        # Section headers like "I. LIFE.", "II. LOVE.", etc.
        if re.match(r"^[IVX]+\.\s+[A-Z]+\.\s*$", stripped):
            flush()
            i += 1
            continue
        # Series dividers ("SECOND SERIES" / "THIRD SERIES").
        # BUG FIX: the original test was
        #     "SERIES" in s and "SECOND" in s or "THIRD" in s
        # which, because `and` binds tighter than `or`, matched ANY line
        # containing "THIRD" (e.g. a poem mentioning the word).
        if "SERIES" in stripped and ("SECOND" in stripped or "THIRD" in stripped):
            flush()
            i += 1
            continue
        # Poem title: short ALL-CAPS phrase directly after a blank line.
        if (stripped and
                len(stripped) < 60 and
                not stripped.startswith("[") and
                stripped == stripped.upper() and
                re.search(r"[A-Z]{2,}", stripped) and
                not re.match(r"^[IVX]+\.\s+[A-Z]+\.\s*$", stripped) and
                i > 0 and not lines[i - 1].strip()):
            flush()
            current_title = stripped.title()
            in_poem = True
            i += 1
            continue
        if in_poem:
            current_body_lines.append(line)
        i += 1
    # Don't forget the last poem.
    flush()
    return poems
def extract_whitman(text):
    """Walt Whitman's Leaves of Grass (Gutenberg 1322).

    A title is a short, un-indented line preceded by a blank line whose
    next non-blank line is indented (the poem body).
    """
    body = extract_body(text)
    poems = []
    # Poems have titles on their own lines, followed by indented poem text.
    # Some are prefixed with "BOOK I." etc.
    lines = body.split("\n")
    # Skip the initial inscription: start at the first real poem.
    start_idx = 0
    for i, line in enumerate(lines):
        if "One's-Self I Sing" in line:
            start_idx = i
            break
    current_title = ""       # title of the poem being accumulated
    current_body_lines = []  # its body lines so far
    i = start_idx
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()
        # Skip BOOK headers ("BOOK I", "BOOK XXXIV", ...).
        if re.match(r"^BOOK\s+[IVXLC]+", stripped):
            i += 1
            continue
        # Title candidate: non-blank, un-indented, short-ish, after a blank line.
        if (stripped and
            len(stripped) < 80 and
            not line.startswith(" ") and
            not stripped.startswith("[") and
            not stripped.startswith("BOOK") and
            i > 0 and not lines[i-1].strip()):
            # Confirm by checking that the next non-blank line is indented
            # (i.e. looks like a poem body, not more prose).
            j = i + 1
            while j < len(lines) and not lines[j].strip():
                j += 1
            if j < len(lines) and lines[j].startswith(" "):
                # It is a title — first save the previous poem, if any.
                if current_title and current_body_lines:
                    p = make_poem(
                        current_title,
                        "\n".join(current_body_lines),
                        "Walt Whitman",
                        "Leaves of Grass",
                        # NOTE(review): "18911892" looks like "1891–1892"
                        # with the en dash lost — confirm against the source.
                        "18911892",
                    )
                    if p:
                        poems.append(p)
                current_title = stripped
                current_body_lines = []
                i += 1
                continue
        if current_title:
            current_body_lines.append(line)
        i += 1
    # Flush the final poem.
    if current_title and current_body_lines:
        p = make_poem(
            current_title,
            "\n".join(current_body_lines),
            "Walt Whitman",
            "Leaves of Grass",
            "18911892",
        )
        if p:
            poems.append(p)
    return poems
def extract_blake(text):
    """William Blake's Songs of Innocence and of Experience (Gutenberg 1934).

    Titles are ALL-CAPS lines preceded by three newlines; the body of each
    poem runs up to the next title.
    """
    content = extract_body(text)
    # Drop illustration placeholders before scanning for titles.
    content = re.sub(r"\[Picture:.*?\]", "", content, flags=re.DOTALL)
    heading_re = re.compile(r"\n\n\n([A-Z][A-Z ,'!?:;\-—]+)\n")
    headings = list(heading_re.finditer(content))
    # Section headings that are not poems themselves.
    section_names = {"SONGS OF INNOCENCE", "SONGS OF EXPERIENCE", "CONTENTS",
                     "SONGS OF INNOCENCE AND OF EXPERIENCE"}
    poems = []
    # Each heading's poem ends where the next heading starts.
    boundaries = [m.start() for m in headings[1:]] + [len(content)]
    for match, end in zip(headings, boundaries):
        heading = match.group(1).strip()
        if heading in section_names:
            continue
        poem = make_poem(
            heading.title(),
            content[match.end():end],
            "William Blake",
            "Songs of Innocence and of Experience",
            "17891794",
        )
        if poem:
            poems.append(poem)
    return poems
def extract_keats(text):
    """Keats: Poems Published in 1820 (Gutenberg 23684).

    Uses a hand-maintained list of (start-marker, end-marker) pairs to
    slice the volume into individual poems, since the layout is irregular.
    The NOTES sections at the back are excluded by the last end marker.
    """
    body = extract_body(text)
    poems = []
    # Drop everything before the ADVERTISEMENT (contents, life of Keats...).
    advert_match = re.search(r"\nADVERTISEMENT\.?\n", body)
    if advert_match:
        body = body[advert_match.start():]
    # Each entry: (marker that starts the poem, marker that ends it).
    poem_titles = [
        ("ADVERTISEMENT", "LAMIA"),
        ("LAMIA. PART I", "LAMIA. PART II"),
        ("LAMIA. PART II", "ISABELLA"),
        ("ISABELLA; OR, THE POT OF BASIL", "THE EVE OF ST. AGNES"),
        ("THE EVE OF ST. AGNES", "ODE TO A NIGHTINGALE"),
        ("ODE TO A NIGHTINGALE", "ODE ON A GRECIAN URN"),
        ("ODE ON A GRECIAN URN", "ODE TO PSYCHE"),
        ("ODE TO PSYCHE", "FANCY"),
        ("FANCY", "ODE"),
        ("ODE\n", "LINES ON THE MERMAID TAVERN"),
        ("LINES ON THE MERMAID TAVERN", "ROBIN HOOD"),
        ("ROBIN HOOD", "TO AUTUMN"),
        ("TO AUTUMN", "ODE ON MELANCHOLY"),
        ("ODE ON MELANCHOLY", "HYPERION"),
        ("HYPERION. BOOK I", "HYPERION. BOOK II"),
        ("HYPERION. BOOK II", "HYPERION. BOOK III"),
        ("HYPERION. BOOK III", "NOTE ON ADVERTISEMENT"),
    ]
    for title_start, title_end in poem_titles:
        start = body.find(title_start)
        # NOTE(review): `end` is computed before the `start == -1` guard
        # below; when the start marker is missing, the search begins at a
        # bogus offset, but the result is then discarded by the `continue`,
        # so it is harmless.
        end = body.find(title_end, start + len(title_start)) if title_end else len(body)
        if start == -1:
            continue
        if end == -1:
            end = len(body)
        section = body[start:end]
        # Remove the title line itself from the extracted section.
        first_newline = section.find("\n")
        if first_newline != -1:
            poem_body = section[first_newline:]
        else:
            poem_body = section
        # Human-readable title, e.g. "LAMIA. PART I" -> "Lamia, Part I".
        display_title = title_start.replace(". PART ", ", Part ").title()
        if display_title == "Advertisement":
            continue
        # str.title() keeps the trailing newline, so this matches "ODE\n".
        if display_title.startswith("Ode\n"):
            display_title = "Ode (Bards of Passion and of Mirth)"
        # Remove editorial line numbers at line ends.
        poem_body = re.sub(r"\s+\d+$", "", poem_body, flags=re.MULTILINE)
        # Remove footnote markers like "[3]".
        poem_body = re.sub(r"\[\d+\]", "", poem_body)
        p = make_poem(
            display_title,
            poem_body,
            "John Keats",
            "Poems Published in 1820",
            "1820",
        )
        if p:
            poems.append(p)
    return poems
def extract_poe(text):
    """Complete Poetical Works of Edgar Allan Poe (Gutenberg 10031).

    Treats each short ALL-CAPS line that follows a blank line as a poem
    title; known front/back-matter headings flush the current poem and are
    skipped.
    """
    body = extract_body(text)
    poems = []
    lines = body.split("\n")
    current_title = ""
    current_body_lines = []
    # Headings that introduce non-poem sections.
    skip_sections = {"CONTENTS", "NOTE", "NOTES", "INDEX", "APPENDIX",
                     "MEMOIR", "PREFACE", "INTRODUCTION", "BIBLIOGRAPHY"}
    i = 0
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()
        # Title candidate: short ALL-CAPS line directly after a blank line.
        if (stripped and
            len(stripped) < 60 and
            stripped == stripped.upper() and
            re.search(r"[A-Z]{3,}", stripped) and
            not stripped.startswith("[") and
            not stripped.startswith("BY ") and
            not re.match(r"^\d+$", stripped) and
            not stripped.startswith("***") and
            i > 0 and not lines[i-1].strip()):
            # Section heading (e.g. "NOTES.") — flush the current poem
            # and reset without starting a new one.
            if stripped.split(".")[0].strip() in skip_sections:
                if current_title and current_body_lines:
                    p = make_poem(
                        current_title,
                        "\n".join(current_body_lines),
                        "Edgar Allan Poe",
                        "Complete Poetical Works of Edgar Allan Poe",
                        # NOTE(review): "18271849" looks like "1827–1849"
                        # with the en dash lost — confirm against the source.
                        "18271849",
                    )
                    if p:
                        poems.append(p)
                current_title = ""
                current_body_lines = []
                i += 1
                continue
            # New poem title — save the previous poem first.
            if current_title and current_body_lines:
                p = make_poem(
                    current_title,
                    "\n".join(current_body_lines),
                    "Edgar Allan Poe",
                    "Complete Poetical Works of Edgar Allan Poe",
                    "18271849",
                )
                if p:
                    poems.append(p)
            current_title = stripped.title()
            current_body_lines = []
            i += 1
            continue
        if current_title:
            current_body_lines.append(line)
        i += 1
    # Flush the final poem.
    if current_title and current_body_lines:
        p = make_poem(
            current_title,
            "\n".join(current_body_lines),
            "Edgar Allan Poe",
            "Complete Poetical Works of Edgar Allan Poe",
            "18271849",
        )
        if p:
            poems.append(p)
    return poems
def extract_browning_sonnets(text):
    """Sonnets from the Portuguese by Elizabeth Barrett Browning (Gutenberg 2002).

    Sonnets are separated by a Roman numeral flanked by blank lines.
    """
    content = extract_body(text)
    segments = re.split(r"\n\n\n+([IVXLC]+)\n\n\n", content)
    # segments alternates: preamble, numeral, body, numeral, body, ...
    collected = []
    for numeral, sonnet in zip(segments[1::2], segments[2::2]):
        poem = make_poem(
            f"Sonnet {numeral.strip()}",
            sonnet,
            "Elizabeth Barrett Browning",
            "Sonnets from the Portuguese",
            "1850",
        )
        if poem is not None:
            collected.append(poem)
    return collected
def extract_eliot_wasteland(text):
    """The Waste Land by T.S. Eliot (Gutenberg 1321).

    Locates the five numbered sections in the poem proper (skipping the
    CONTENTS listing) and emits each as a separate entry.
    """
    body = extract_body(text)
    poems = []
    section_defs = [
        ("I", "THE BURIAL OF THE DEAD"),
        ("II", "A GAME OF CHESS"),
        ("III", "THE FIRE SERMON"),
        ("IV", "DEATH BY WATER"),
        ("V", "WHAT THE THUNDER SAID"),
    ]
    # Collect (display title, heading start, heading end) for each section.
    positions = []
    for num, title in section_defs:
        # Matches an indented heading line such as "  I. THE BURIAL OF THE DEAD".
        pattern = re.compile(
            r"^\s+" + re.escape(num) + r"\.\s+" + re.escape(title) + r"\s*$",
            re.MULTILINE,
        )
        matches = list(pattern.finditer(body))
        # Use the second occurrence (the first is the CONTENTS entry).
        if len(matches) >= 2:
            positions.append((f"The Waste Land: {title.title()}", matches[1].start(), matches[1].end()))
        elif matches:
            positions.append((f"The Waste Land: {title.title()}", matches[0].start(), matches[0].end()))
    # End of the poem: the NOTES section that follows it (again prefer the
    # second occurrence when the phrase also appears earlier).
    notes_matches = list(re.finditer(r"NOTES ON", body))
    notes_pos = notes_matches[1].start() if len(notes_matches) >= 2 else (
        notes_matches[0].start() if notes_matches else len(body))
    for i, (title, sec_start, sec_text_start) in enumerate(positions):
        # Each section runs up to the next section's heading (or to NOTES).
        end = positions[i + 1][1] if i + 1 < len(positions) else notes_pos
        section = body[sec_text_start:end]
        # Remove editorial line numbers at line ends.
        section = re.sub(r"\s+\d+$", "", section, flags=re.MULTILINE)
        p = make_poem(
            title,
            section,
            "T.S. Eliot",
            "The Waste Land",
            "1922",
        )
        if p:
            poems.append(p)
    return poems
def extract_frost_mountain(text):
    """Robert Frost's Mountain Interval (Gutenberg 29345).

    Titles appear either as _ITALIC_ markup on their own line or as plain
    short ALL-CAPS lines after a blank line.
    """
    body = extract_body(text)
    poems = []
    lines = body.split("\n")
    # Find where poems start (first title after the CONTENTS listing).
    start_idx = 0
    for i, line in enumerate(lines):
        if "_THE ROAD NOT TAKEN_" in line or "THE ROAD NOT TAKEN" in line:
            start_idx = i
            break
    current_title = ""
    current_body_lines = []
    i = start_idx
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()
        # Italic title pattern: _TITLE_ on its own line.
        title_match = re.match(r"^_([A-Z][A-Z ,'!?:\-.—\"]+)_$", stripped)
        if not title_match:
            # Fallback: plain ALL-CAPS short line after a blank line.
            if (stripped and len(stripped) < 60 and
                stripped == stripped.upper() and
                re.search(r"[A-Z]{3,}", stripped) and
                not stripped.startswith("[") and
                i > 0 and not lines[i-1].strip()):
                # From here on title_match is reused as a plain truthy flag.
                title_match = True
                title_text = stripped.title()
            else:
                title_match = None
        else:
            title_text = title_match.group(1).title()
            title_match = True
        if title_match:
            # NOTE(review): sub_title is assigned but never used — the
            # subtitle handling below looks unfinished; confirm intent.
            sub_title = ""
            if i + 1 < len(lines) and lines[i + 1].strip():
                sub = lines[i + 1].strip()
                # Lines like "I LONELINESS--HER WORD" are numbered sub-parts.
                if re.match(r"^_?[IVX]+\s", sub) or re.match(r"^\d+\s", sub):
                    pass  # numbered sub-parts, don't treat as subtitle
            # Save the previous poem before starting a new one.
            if current_title and current_body_lines:
                p = make_poem(
                    current_title,
                    "\n".join(current_body_lines),
                    "Robert Frost",
                    "Mountain Interval",
                    "1916",
                )
                if p:
                    poems.append(p)
            current_title = title_text
            current_body_lines = []
            i += 1
            continue
        if current_title:
            current_body_lines.append(line)
        i += 1
    # Flush the final poem.
    if current_title and current_body_lines:
        p = make_poem(
            current_title,
            "\n".join(current_body_lines),
            "Robert Frost",
            "Mountain Interval",
            "1916",
        )
        if p:
            poems.append(p)
    return poems
def extract_frost_selected(text):
    """Robert Frost's Selected Poems (Gutenberg 59824)."""
    body = extract_body(text)
    poems = []
    lines = body.split("\n")
    # Find the first poem; `i > 50` skips the table-of-contents mention.
    start_idx = 0
    for i, line in enumerate(lines):
        if "THE PASTURE" in line and i > 50:
            start_idx = i
            break
    current_title = ""
    current_body_lines = []
    i = start_idx
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()
        # Section dividers: a bare Roman numeral after a blank line.
        if re.match(r"^[IVX]+$", stripped) and i > 0 and not lines[i-1].strip():
            i += 1
            continue
        # Title: short ALL-CAPS line after a blank line.
        if (stripped and len(stripped) < 70 and
            stripped == stripped.upper() and
            re.search(r"[A-Z]{3,}", stripped) and
            not stripped.startswith("[") and
            not re.match(r"^[IVX]+$", stripped) and
            i > 0 and not lines[i-1].strip()):
            # Save the previous poem before starting a new one.
            if current_title and current_body_lines:
                p = make_poem(
                    current_title,
                    "\n".join(current_body_lines),
                    "Robert Frost",
                    "Selected Poems",
                    # NOTE(review): "19131916" looks like "1913–1916" with
                    # the en dash lost — confirm against the source.
                    "19131916",
                )
                if p:
                    poems.append(p)
            current_title = stripped.title()
            current_body_lines = []
            i += 1
            continue
        if current_title:
            current_body_lines.append(line)
        i += 1
    # Flush the final poem.
    if current_title and current_body_lines:
        p = make_poem(
            current_title,
            "\n".join(current_body_lines),
            "Robert Frost",
            "Selected Poems",
            "19131916",
        )
        if p:
            poems.append(p)
    return poems
def extract_yeats(text):
    """W.B. Yeats' The Wind Among the Reeds (Gutenberg 32233).

    Poems have ALL-CAPS titles; the first title also appears in the
    CONTENTS, so scanning starts at its second occurrence when present.
    """
    body = extract_body(text)
    poems = []
    # Locate the first poem.
    idx = body.find("THE HOSTING OF THE SIDHE\n")
    if idx == -1:
        return poems
    # Prefer the second occurrence (the first is the CONTENTS entry).
    idx2 = body.find("THE HOSTING OF THE SIDHE\n", idx + 10)
    if idx2 != -1:
        body = body[idx2:]
    else:
        body = body[idx:]
    # Scan for ALL-CAPS title lines; everything until the next title
    # belongs to the current poem.
    lines = body.split("\n")
    current_title = ""
    current_body_lines = []
    for i, line in enumerate(lines):
        stripped = line.strip()
        # Title: ALL CAPS after a blank line (or the very first line).
        if (stripped and
            len(stripped) < 80 and
            stripped == stripped.upper() and
            re.search(r"[A-Z]{3,}", stripped) and
            not stripped.startswith("[") and
            not stripped.startswith("NOTE") and
            (i == 0 or not lines[i-1].strip())):
            # Save the previous poem before starting a new one.
            if current_title and current_body_lines:
                p = make_poem(
                    current_title,
                    "\n".join(current_body_lines),
                    "W.B. Yeats",
                    "The Wind Among the Reeds",
                    "1899",
                )
                if p:
                    poems.append(p)
            current_title = stripped.title()
            current_body_lines = []
            continue
        if current_title:
            current_body_lines.append(line)
    # Flush the final poem.
    if current_title and current_body_lines:
        p = make_poem(
            current_title,
            "\n".join(current_body_lines),
            "W.B. Yeats",
            "The Wind Among the Reeds",
            "1899",
        )
        if p:
            poems.append(p)
    return poems
def extract_khayyam(text):
    """The Rubaiyat of Omar Khayyam (Gutenberg 246).

    Extracts the quatrains of both the First and the Fifth FitzGerald
    editions contained in this ebook.
    """
    body = extract_body(text)
    poems = []
    for edition, label in [("First Edition", "First Edition"),
                           ("Fifth Edition", "Fifth Edition")]:
        # The phrase appears several times (contents, headings); pick the
        # occurrence that is soon followed by quatrain "I.".
        positions = [m.start() for m in re.finditer(re.escape(edition), body)]
        ed_start = None
        for pos in positions:
            chunk = body[pos:pos+200]
            if re.search(r"\n\n+I\.\n\n", chunk):
                ed_start = pos
                break
        if ed_start is None:
            continue
        ed_body = body[ed_start:]
        # Trim at the next edition heading or the Notes section; searching
        # from offset 200 skips past this edition's own heading.
        end_match = re.search(r"\n\n\n\n\n(First|Fifth) Edition", ed_body[200:])
        notes_match = re.search(r"\n\n\n\n\nNotes", ed_body[200:])
        if end_match:
            ed_body = ed_body[:end_match.start() + 200]
        elif notes_match:
            ed_body = ed_body[:notes_match.start() + 200]
        # Quatrains are numbered "I.", "II.", ... between blank lines;
        # re.split alternates text/numeral/quatrain/numeral/quatrain...
        parts = re.split(r"\n\n+([IVXLC]+)\.\n\n", ed_body)
        for i in range(1, len(parts) - 1, 2):
            numeral = parts[i].strip()
            quatrain = parts[i + 1]
            p = make_poem(
                f"Quatrain {numeral} ({label})",
                quatrain,
                "Omar Khayyam (trans. Edward FitzGerald)",
                "The Rubaiyat of Omar Khayyam",
                # NOTE(review): "11th12th century" looks like "11th–12th
                # century" with the en dash lost — confirm.
                "11th12th century",
            )
            if p:
                poems.append(p)
    return poems
def extract_burns(text):
    """Poems and Songs of Robert Burns (Gutenberg 1279).

    The collection is organized by year headings; titles are mixed-case
    lines such as "Song—Handsome Nell" or "To A Mouse" that follow a
    blank line.
    """
    body = extract_body(text)
    poems = []
    lines = body.split("\n")
    current_title = ""
    current_body_lines = []
    # Front/back-matter headings that must not become poem titles.
    skip_patterns = {"INTRODUCTORY NOTE", "GLOSSARY", "INDEX", "NOTES",
                     "APPENDIX", "CONTENTS", "PREFACE"}

    def flush():
        # Emit the poem accumulated so far (if any) and reset the state.
        nonlocal current_title, current_body_lines
        if current_title and current_body_lines:
            p = make_poem(
                current_title,
                "\n".join(current_body_lines),
                "Robert Burns",
                "Poems and Songs of Robert Burns",
                # NOTE: restored to "1771–1796"; the en dash had been lost
                # ("17711796") in this file.
                "1771–1796",
            )
            if p:
                poems.append(p)
        current_title = ""
        current_body_lines = []

    i = 0
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()
        # Year headings like "1771 - 1779" or "1780" delimit groups.
        if re.match(r"^\d{4}(\s*[-]\s*\d{4})?\s*$", stripped):
            flush()
            i += 1
            continue
        # Title candidate: short, un-indented, after a blank line, with at
        # least one capitalized word.
        if (stripped and
                len(stripped) < 80 and
                not line.startswith(" ") and  # not indented poem body
                i > 0 and not lines[i - 1].strip() and
                re.search(r"[A-Z][a-z]", stripped) and
                not stripped.startswith("[") and
                not stripped.startswith("Footnote")):
            is_title = False
            # Genre-prefixed titles: "Song—...", "Ballad—...", etc.
            if re.match(r"^(Song|Ballad|Epistle|Elegy|Epitaph|Ode|Address|Epigram|Extempore|Fragment|Prologue|Lament|Lines|Stanzas|Verses|Inscription)[\s—\-:]", stripped):
                is_title = True
            # Titles starting with common opening words.
            elif re.match(r"^(To |On |The |A |My |Tam |Holy |Poor |Bonnie |Highland )", stripped):
                is_title = True
            # ALL CAPS titles.
            elif stripped == stripped.upper() and len(stripped) > 5:
                is_title = True
            # BUG FIX: this test was `"" in stripped`, which is True for
            # every string, so every candidate was accepted and the
            # indentation check below was unreachable. The intended
            # character is the em dash used throughout Burns titles
            # (e.g. "Song—Handsome Nell") and in the genre regex above.
            elif "—" in stripped or stripped.endswith(":"):
                is_title = True
            # Otherwise accept only if the following text looks like a
            # poem body (indented or not ALL CAPS).
            elif i + 1 < len(lines):
                j = i + 1
                while j < len(lines) and not lines[j].strip():
                    j += 1
                if j < len(lines) and (lines[j].startswith(" ") or
                                       lines[j].strip() != lines[j].strip().upper()):
                    if not any(stripped.upper().startswith(s) for s in skip_patterns):
                        is_title = True
            if is_title:
                # Absorb continuation lines of multi-line titles
                # ("...\nOn Turning Her Up In Her Nest With The Plough").
                full_title = stripped
                j = i + 1
                while (j < len(lines) and lines[j].strip() and
                       not lines[j].startswith(" ") and
                       len(lines[j].strip()) < 60):
                    next_stripped = lines[j].strip()
                    if re.match(r"^(On |In |To |By |At |For |Or |And )", next_stripped):
                        full_title += " " + next_stripped
                        j += 1
                    else:
                        break
                flush()
                current_title = full_title
                i = j
                continue
        if current_title:
            current_body_lines.append(line)
        i += 1
    flush()
    return poems
def extract_wordsworth(text):
    """Lyrical Ballads by William Wordsworth (Gutenberg 9622)."""
    body = extract_body(text)
    poems = []
    lines = body.split("\n")
    # Find where poems start; `i > 50` skips the CONTENTS entry for the
    # same title.
    start_idx = 0
    for i, line in enumerate(lines):
        stripped = line.strip()
        if stripped == "THE RIME OF THE ANCYENT MARINERE" and i > 50:
            start_idx = i
            break
    current_title = ""
    current_body_lines = []
    i = start_idx
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()
        # Title: ALL-CAPS line after a blank line; "PART ..." headings stay
        # inside the current poem.
        if (stripped and
            len(stripped) < 80 and
            stripped == stripped.upper() and
            re.search(r"[A-Z]{3,}", stripped) and
            not stripped.startswith("[") and
            not stripped.startswith("NOTE") and
            not stripped.startswith("***") and
            not stripped.startswith("PART ") and
            (i == 0 or not lines[i-1].strip())):
            # Save the previous poem before starting a new one.
            if current_title and current_body_lines:
                p = make_poem(
                    current_title,
                    "\n".join(current_body_lines),
                    "William Wordsworth",
                    "Lyrical Ballads",
                    "1798",
                )
                if p:
                    poems.append(p)
            current_title = stripped.title()
            current_body_lines = []
            i += 1
            continue
        if current_title:
            current_body_lines.append(line)
        i += 1
    # Flush the final poem.
    if current_title and current_body_lines:
        p = make_poem(
            current_title,
            "\n".join(current_body_lines),
            "William Wordsworth",
            "Lyrical Ballads",
            "1798",
        )
        if p:
            poems.append(p)
    return poems
def extract_shelley(text):
    """Complete Poetical Works of Shelley (Gutenberg 4800).

    The volume mixes verse with a lot of editorial prose, so besides title
    detection the collector filters each candidate body: it must be longer
    than 200 characters and must not start like front matter or a TOC.
    """
    body = extract_body(text)
    poems = []
    lines = body.split("\n")
    current_title = ""
    current_body_lines = []
    # ALL-CAPS headings that are never poem titles.
    skip_titles = {"CONTENTS", "NOTE", "NOTES", "PREFACE", "INTRODUCTION",
                   "APPENDIX", "DEDICATION", "ADVERTISEMENTS", "MEMOIR",
                   "POSTSCRIPT", "DRAMATIS PERSONAE", "INDEX",
                   "BIBLIOGRAPHY", "TABLE OF CONTENTS"}
    # Prefixes of editorial/prose headings; a "title" starting with one of
    # these is discarded rather than collected.
    skip_prefixes = ("NOTE BY", "TO ", "INCLUDING", "EDITED", "THOMAS",
                     "MARY W", "LONDON", "POSTSCRIPT")
    i = 0
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()
        # Title: short ALL-CAPS line after a blank line, excluding
        # act/scene/part markers and numbered entries.
        if (stripped and
            3 < len(stripped) < 70 and
            stripped == stripped.upper() and
            re.search(r"[A-Z]{3,}", stripped) and
            not stripped.startswith("[") and
            not stripped.startswith("***") and
            not re.match(r"^(ACT|SCENE|PART)\s", stripped) and
            not re.match(r"^[IVX]+\.$", stripped) and
            not re.match(r"^\d+\.", stripped) and
            (i == 0 or not lines[i-1].strip())):
            title_word = stripped.split(".")[0].strip()
            if title_word in skip_titles:
                i += 1
                continue
            # Skip notes entries like "NOTES ON ...".
            if stripped.startswith("NOTES"):
                i += 1
                continue
            if current_title and current_body_lines:
                poem_text = "\n".join(current_body_lines)
                # Only keep poems with real verse content: skip TOC entries
                # (mostly CANTO/CHAPTER lines) and editorial notes.
                cleaned = poem_text.strip()
                if (len(cleaned) > 200 and
                    not cleaned.startswith("PREFACE") and
                    not cleaned.startswith("CANTO") and
                    not re.match(r"^(PREFACE|DEDICATION|CANTO|PART|NOTE|_\d)", cleaned)):
                    p = make_poem(
                        current_title,
                        poem_text,
                        "Percy Bysshe Shelley",
                        "Complete Poetical Works of Shelley",
                        # NOTE(review): "18101822" looks like "1810–1822"
                        # with the en dash lost — confirm against the source.
                        "18101822",
                    )
                    if p:
                        poems.append(p)
            current_title = stripped.title()
            # Discard editorial headings entirely.
            if any(stripped.startswith(sp) for sp in skip_prefixes):
                current_title = ""
                current_body_lines = []
                i += 1
                continue
            current_body_lines = []
            i += 1
            continue
        if current_title:
            current_body_lines.append(line)
        i += 1
    # Flush the final poem, applying the same content filter.
    if current_title and current_body_lines:
        poem_text = "\n".join(current_body_lines)
        cleaned = poem_text.strip()
        if (len(cleaned) > 200 and
            not cleaned.startswith("PREFACE") and
            not cleaned.startswith("CANTO") and
            not re.match(r"^(PREFACE|DEDICATION|CANTO|PART|NOTE|_\d)", cleaned)):
            p = make_poem(
                current_title,
                poem_text,
                "Percy Bysshe Shelley",
                "Complete Poetical Works of Shelley",
                "18101822",
            )
            if p:
                poems.append(p)
    return poems
# ─── Sources ─────────────────────────────────────────────────────
# Each entry: Gutenberg ebook id, output JSON filename, display metadata,
# and the extractor function that understands that edition's layout.
SOURCES = [
    {
        "id": 1041,
        "filename": "shakespeare_sonnets.json",
        "title": "Shakespeare's Sonnets",
        "author": "William Shakespeare",
        "extractor": extract_shakespeare_sonnets,
    },
    {
        "id": 12242,
        "filename": "dickinson_poems.json",
        "title": "Poems by Emily Dickinson",
        "author": "Emily Dickinson",
        "extractor": extract_dickinson,
    },
    {
        "id": 1322,
        "filename": "whitman_leaves_of_grass.json",
        "title": "Leaves of Grass",
        "author": "Walt Whitman",
        "extractor": extract_whitman,
    },
    {
        "id": 1934,
        "filename": "blake_songs.json",
        "title": "Songs of Innocence and of Experience",
        "author": "William Blake",
        "extractor": extract_blake,
    },
    {
        "id": 23684,
        "filename": "keats_poems_1820.json",
        "title": "Poems Published in 1820",
        "author": "John Keats",
        "extractor": extract_keats,
    },
    {
        "id": 10031,
        "filename": "poe_poetical_works.json",
        "title": "Complete Poetical Works",
        "author": "Edgar Allan Poe",
        "extractor": extract_poe,
    },
    {
        "id": 2002,
        "filename": "browning_sonnets_portuguese.json",
        "title": "Sonnets from the Portuguese",
        "author": "Elizabeth Barrett Browning",
        "extractor": extract_browning_sonnets,
    },
    {
        "id": 1321,
        "filename": "eliot_waste_land.json",
        "title": "The Waste Land",
        "author": "T.S. Eliot",
        "extractor": extract_eliot_wasteland,
    },
    {
        "id": 29345,
        "filename": "frost_mountain_interval.json",
        "title": "Mountain Interval",
        "author": "Robert Frost",
        "extractor": extract_frost_mountain,
    },
    {
        "id": 59824,
        "filename": "frost_selected_poems.json",
        "title": "Selected Poems",
        "author": "Robert Frost",
        "extractor": extract_frost_selected,
    },
    {
        "id": 32233,
        "filename": "yeats_wind_reeds.json",
        "title": "The Wind Among the Reeds",
        "author": "W.B. Yeats",
        "extractor": extract_yeats,
    },
    {
        "id": 246,
        "filename": "khayyam_rubaiyat.json",
        "title": "The Rubaiyat of Omar Khayyam",
        "author": "Omar Khayyam",
        "extractor": extract_khayyam,
    },
    {
        "id": 1279,
        "filename": "burns_poems_songs.json",
        "title": "Poems and Songs",
        "author": "Robert Burns",
        "extractor": extract_burns,
    },
    {
        "id": 9622,
        "filename": "wordsworth_lyrical_ballads.json",
        "title": "Lyrical Ballads",
        "author": "William Wordsworth",
        "extractor": extract_wordsworth,
    },
    {
        "id": 4800,
        "filename": "shelley_poetical_works.json",
        "title": "Complete Poetical Works",
        "author": "Percy Bysshe Shelley",
        "extractor": extract_shelley,
    },
]
def download_source(source):
    """Fetch one source's text and run its extractor; return the poem list."""
    raw_text = fetch_text(source["id"])
    extracted = source["extractor"](raw_text)
    print(f" Extracted {len(extracted)} poems")
    return extracted
def save_poems(poems, filename):
    """Write *poems* into POETRY_DIR/<filename> as pretty-printed UTF-8 JSON."""
    destination = os.path.join(POETRY_DIR, filename)
    with open(destination, "w", encoding="utf-8") as handle:
        json.dump(poems, handle, indent=2, ensure_ascii=False)
    print(f" Saved to {destination}")
def main():
    """CLI entry point: with --list show the sources, otherwise download all."""
    if "--list" in sys.argv:
        print("Available poetry sources:\n")
        for s in SOURCES:
            print(f" {s['author']:35s} {s['title']}")
            print(f" {'':35s} Gutenberg #{s['id']}")
            print()
        return
    os.makedirs(POETRY_DIR, exist_ok=True)
    total = 0
    for source in SOURCES:
        print(f"\n{'='*60}")
        # BUG FIX: author and title were printed with no separator between
        # them (an em dash appears to have been lost from the f-string).
        print(f" {source['author']} — {source['title']}")
        print(f" Gutenberg #{source['id']}")
        print(f"{'='*60}")
        try:
            poems = download_source(source)
            if poems:
                save_poems(poems, source["filename"])
                total += len(poems)
            else:
                print(" WARNING: No poems extracted!")
        except Exception as e:
            # Keep going: one failing source shouldn't abort the whole run.
            print(f" ERROR: {e}")
    print(f"\n{'='*60}")
    print(f" Total: {total} poems downloaded")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()