Fix Poe parser and add font size controls

- Rewrite Poe extractor with section tracking: 51 clean poems
  (was 108 with junk notes, prose, and dividers)
- Skip memoir, notes, prose poems, essays, dedications
- Properly extract from all 4 poem sections

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
4 months ago · 49d11e8869
parent e4464b6773
commit 49d11e8869
3 changed files with 92 additions and 460 deletions
--- a/download_poetry.py
+++ b/download_poetry.py
@ -412,82 +412,112 @@ def extract_poe(text):
    """Complete Poetical Works of Edgar Allan Poe (Gutenberg 10031)."""
    body = extract_body(text)
    poems = []
-
-    # Find the start of actual poems (after intro/contents)
-    # Poems have titles in ALL CAPS separated by blank lines
-    lines = body.split("\n")
-
+    lines_list = body.split("\n")
+
+    # Top-level sections containing actual poems
+    poem_sections = {"POEMS OF LATER LIFE", "POEMS OF MANHOOD",
+                     "POEMS OF YOUTH", "DOUBTFUL POEMS"}
+    # Top-level sections that are NOT poems
+    non_poem_sections = {"CONTENTS", "TABLE OF CONTENTS", "NOTES", "NOTE",
+                         "MEMOIR", "MEMOIR OF EDGAR ALLAN POE",
+                         "PROSE POEMS", "ESSAYS", "NOTE ON POLITIAN",
+                         'SCENES FROM "POLITIAN"'}
+    # Sub-headings within poem sections to skip
+    skip_titles = {"PREFACE", "INTRODUCTION TO POEMS1831",
+                   "LETTER TO MR B", "JOHN H INGRAM",
+                   "THE NOBLEST OF HER SEX",
+                   "MISS ELIZABETH BARRETT BARRETT",
+                   "OF ENGLAND", "I DEDICATE THIS VOLUME",
+                   "WITH THE MOST ENTHUSIASTIC ADMIRATION AND",
+                   "WEST POINT 1831", "DEAR B"}
+
+    in_poem_section = False
    current_title = ""
    current_body_lines = []
-    skip_sections = {"CONTENTS", "NOTE", "NOTES", "INDEX", "APPENDIX",
-                     "MEMOIR", "PREFACE", "INTRODUCTION", "BIBLIOGRAPHY"}
+
+    def _is_title(stripped, idx):
+        return (stripped and
+                len(stripped) < 60 and
+                stripped == stripped.upper() and
+                re.search(r"[A-Z]{3,}", stripped) and
+                not stripped.startswith("[") and
+                not stripped.startswith("BY ") and
+                not re.match(r"^\d+$", stripped) and
+                not re.match(r"^\d+[.\s]", stripped) and
+                not re.match(r"^[IVXLC]+\.$", stripped) and
+                not stripped.startswith("***") and
+                idx > 0 and not lines_list[idx - 1].strip())
+
+    def _save_current():
+        if current_title and current_body_lines:
+            body_text = "\n".join(current_body_lines)
+            cleaned = body_text.strip()
+            if (not re.match(r"^[\s*]+$", cleaned)
+                    and len(cleaned) >= 50):
+                p = make_poem(current_title, body_text,
+                              "Edgar Allan Poe",
+                              "Complete Poetical Works of Edgar Allan Poe",
+                              "1827\u20131849")
+                if p:
+                    poems.append(p)
+
+    def _norm(s):
+        return re.sub(r"[.,:;\"\'-]", "", s).strip()

    i = 0
-    while i < len(lines):
-        line = lines[i]
+    while i < len(lines_list):
+        line = lines_list[i]
        stripped = line.strip()

-        # Title detection: short ALL CAPS line after blank line
-        if (stripped and
-            len(stripped) < 60 and
-            stripped == stripped.upper() and
-            re.search(r"[A-Z]{3,}", stripped) and
-            not stripped.startswith("[") and
-            not stripped.startswith("BY ") and
-            not re.match(r"^\d+$", stripped) and
-            not stripped.startswith("***") and
-            i > 0 and not lines[i-1].strip()):
+        if _is_title(stripped, i):
+            clean = stripped.rstrip(".,:;").rstrip()
+            normed = _norm(stripped)

-            # Check if this is a section to skip
-            if stripped.split(".")[0].strip() in skip_sections:
-                if current_title and current_body_lines:
-                    p = make_poem(
-                        current_title,
-                        "\n".join(current_body_lines),
-                        "Edgar Allan Poe",
-                        "Complete Poetical Works of Edgar Allan Poe",
-                        "1827–1849",
-                    )
-                    if p:
-                        poems.append(p)
+            # Check for poem section header
+            if clean in poem_sections or normed in {_norm(s) for s in poem_sections}:
+                _save_current()
+                in_poem_section = True
                current_title = ""
                current_body_lines = []
                i += 1
                continue

-            # Save previous poem
-            if current_title and current_body_lines:
-                p = make_poem(
-                    current_title,
-                    "\n".join(current_body_lines),
-                    "Edgar Allan Poe",
-                    "Complete Poetical Works of Edgar Allan Poe",
-                    "1827–1849",
-                )
-                if p:
-                    poems.append(p)
+            # Check for non-poem section header
+            if clean in non_poem_sections or normed in {_norm(s) for s in non_poem_sections}:
+                _save_current()
+                in_poem_section = False
+                current_title = ""
+                current_body_lines = []
+                i += 1
+                continue

-            current_title = stripped.title()
-            current_body_lines = []
-            i += 1
-            continue
+            # Within a poem section
+            if in_poem_section:
+                # Skip certain sub-headings without starting a poem
+                if normed in {_norm(s) for s in skip_titles}:
+                    _save_current()
+                    current_title = ""
+                    current_body_lines = []
+                    i += 1
+                    continue
+
+                # New poem title
+                _save_current()
+                title = stripped.title()
+                # Give "Part I" / "Part Ii" proper names
+                if re.match(r"Part [Ii]+\.", title):
+                    title = "Al Aaraaf — " + title
+                current_title = title
+                current_body_lines = []
+                i += 1
+                continue

-        if current_title:
+        if in_poem_section and current_title:
            current_body_lines.append(line)

        i += 1

-    if current_title and current_body_lines:
-        p = make_poem(
-            current_title,
-            "\n".join(current_body_lines),
-            "Edgar Allan Poe",
-            "Complete Poetical Works of Edgar Allan Poe",
-            "1827–1849",
-        )
-        if p:
-            poems.append(p)
-
+    _save_current()
    return poems


--- a/hicalsoft.github.io
+++ b/hicalsoft.github.io
@ -0,0 +1 @@
+Subproject commit 7fd16c08b20d81da497d2efb44af2e83860382f4
--- a/poetry/poe_poetical_works.json
+++ b/poetry/poe_poetical_works.json
				`@ -0,0 +1 @@`
				`Subproject commit 7fd16c08b20d81da497d2efb44af2e83860382f4`