Fix Poe parser and add font size controls

- Rewrite Poe extractor with section tracking: 51 clean poems (was 108 with junk notes, prose, and dividers)
- Skip memoir, notes, prose poems, essays, dedications
- Properly extract from all 4 poem sections

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
4 months ago · 49d11e8869
parent e4464b6773
commit 49d11e8869
3 changed files with 92 additions and 460 deletions
--- a/download_poetry.py
+++ b/download_poetry.py
@@ -412,82 +412,112 @@ def extract_poe(text):
    """Complete Poetical Works of Edgar Allan Poe (Gutenberg 10031)."""
    body = extract_body(text)
    poems = []
-
+    lines_list = body.split("\n")
-    # Find the start of actual poems (after intro/contents)
+
-    # Poems have titles in ALL CAPS separated by blank lines
+    # Top-level sections containing actual poems
-    lines = body.split("\n")
+    poem_sections = {"POEMS OF LATER LIFE", "POEMS OF MANHOOD",
-
+                     "POEMS OF YOUTH", "DOUBTFUL POEMS"}
    # Top-level sections that are NOT poems
    non_poem_sections = {"CONTENTS", "TABLE OF CONTENTS", "NOTES", "NOTE",
                         "MEMOIR", "MEMOIR OF EDGAR ALLAN POE",
                         "PROSE POEMS", "ESSAYS", "NOTE ON POLITIAN",
                         'SCENES FROM "POLITIAN"'}
    # Sub-headings within poem sections to skip
    skip_titles = {"PREFACE", "INTRODUCTION TO POEMS1831",
                   "LETTER TO MR B", "JOHN H INGRAM",
                   "THE NOBLEST OF HER SEX",
                   "MISS ELIZABETH BARRETT BARRETT",
                   "OF ENGLAND", "I DEDICATE THIS VOLUME",
                   "WITH THE MOST ENTHUSIASTIC ADMIRATION AND",
                   "WEST POINT 1831", "DEAR B"}
    in_poem_section = False
    current_title = ""
    current_body_lines = []
    skip_sections = {"CONTENTS", "NOTE", "NOTES", "INDEX", "APPENDIX",
                     "MEMOIR", "PREFACE", "INTRODUCTION", "BIBLIOGRAPHY"}
-    i = 0
+    def _is_title(stripped, idx):
-    while i < len(lines):
+        return (stripped and
        line = lines[i]
        stripped = line.strip()
        # Title detection: short ALL CAPS line after blank line
        if (stripped and
                len(stripped) < 60 and
                stripped == stripped.upper() and
                re.search(r"[A-Z]{3,}", stripped) and
                not stripped.startswith("[") and
                not stripped.startswith("BY ") and
                not re.match(r"^\d+$", stripped) and
                not re.match(r"^\d+[.\s]", stripped) and
                not re.match(r"^[IVXLC]+\.$", stripped) and
                not stripped.startswith("***") and
-            i > 0 and not lines[i-1].strip()):
+                idx > 0 and not lines_list[idx - 1].strip())
-            # Check if this is a section to skip
+    def _save_current():
            if stripped.split(".")[0].strip() in skip_sections:
        if current_title and current_body_lines:
-                    p = make_poem(
+            body_text = "\n".join(current_body_lines)
-                        current_title,
+            cleaned = body_text.strip()
-                        "\n".join(current_body_lines),
+            if (not re.match(r"^[\s*]+$", cleaned)
                    and len(cleaned) >= 50):
                p = make_poem(current_title, body_text,
                              "Edgar Allan Poe",
                              "Complete Poetical Works of Edgar Allan Poe",
-                        "1827–1849",
+                              "1827\u20131849")
                    )
                if p:
                    poems.append(p)
    def _norm(s):
        return re.sub(r"[.,:;\"\'-]", "", s).strip()
    i = 0
    while i < len(lines_list):
        line = lines_list[i]
        stripped = line.strip()
        if _is_title(stripped, i):
            clean = stripped.rstrip(".,:;").rstrip()
            normed = _norm(stripped)
            # Check for poem section header
            if clean in poem_sections or normed in {_norm(s) for s in poem_sections}:
                _save_current()
                in_poem_section = True
                current_title = ""
                current_body_lines = []
                i += 1
                continue
-            # Save previous poem
+            # Check for non-poem section header
-            if current_title and current_body_lines:
+            if clean in non_poem_sections or normed in {_norm(s) for s in non_poem_sections}:
-                p = make_poem(
+                _save_current()
-                    current_title,
+                in_poem_section = False
-                    "\n".join(current_body_lines),
+                current_title = ""
-                    "Edgar Allan Poe",
+                current_body_lines = []
-                    "Complete Poetical Works of Edgar Allan Poe",
+                i += 1
-                    "1827–1849",
+                continue
                )
                if p:
                    poems.append(p)
-            current_title = stripped.title()
+            # Within a poem section
            if in_poem_section:
                # Skip certain sub-headings without starting a poem
                if normed in {_norm(s) for s in skip_titles}:
                    _save_current()
                    current_title = ""
                    current_body_lines = []
                    i += 1
                    continue
-        if current_title:
+                # New poem title
                _save_current()
                title = stripped.title()
                # Give "Part I" / "Part Ii" proper names
                if re.match(r"Part [Ii]+\.", title):
                    title = "Al Aaraaf — " + title
                current_title = title
                current_body_lines = []
                i += 1
                continue
        if in_poem_section and current_title:
            current_body_lines.append(line)
        i += 1
-    if current_title and current_body_lines:
+    _save_current()
        p = make_poem(
            current_title,
            "\n".join(current_body_lines),
            "Edgar Allan Poe",
            "Complete Poetical Works of Edgar Allan Poe",
            "1827–1849",
        )
        if p:
            poems.append(p)
    return poems
--- a/hicalsoft.github.io
+++ b/hicalsoft.github.io
@@ -0,0 +1 @@
 Subproject commit 7fd16c08b20d81da497d2efb44af2e83860382f4
--- a/poetry/poe_poetical_works.json
+++ b/poetry/poe_poetical_works.json
@@ -0,0 +1 @@
 Subproject commit 7fd16c08b20d81da497d2efb44af2e83860382f4