Fix Poe parser and add font size controls

- Rewrite Poe extractor with section tracking: 51 clean poems
  (was 108 with junk notes, prose, and dividers)
- Skip memoir, notes, prose poems, essays, dedications
- Properly extract from all 4 poem sections

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
main
Ibraheem Saleh 1 week ago
parent e4464b6773
commit 49d11e8869

@@ -412,82 +412,112 @@ def extract_poe(text):
"""Complete Poetical Works of Edgar Allan Poe (Gutenberg 10031)."""
body = extract_body(text)
poems = []
# Find the start of actual poems (after intro/contents)
# Poems have titles in ALL CAPS separated by blank lines
lines = body.split("\n")
lines_list = body.split("\n")
# Top-level sections containing actual poems
poem_sections = {"POEMS OF LATER LIFE", "POEMS OF MANHOOD",
"POEMS OF YOUTH", "DOUBTFUL POEMS"}
# Top-level sections that are NOT poems
non_poem_sections = {"CONTENTS", "TABLE OF CONTENTS", "NOTES", "NOTE",
"MEMOIR", "MEMOIR OF EDGAR ALLAN POE",
"PROSE POEMS", "ESSAYS", "NOTE ON POLITIAN",
'SCENES FROM "POLITIAN"'}
# Sub-headings within poem sections to skip
skip_titles = {"PREFACE", "INTRODUCTION TO POEMS1831",
"LETTER TO MR B", "JOHN H INGRAM",
"THE NOBLEST OF HER SEX",
"MISS ELIZABETH BARRETT BARRETT",
"OF ENGLAND", "I DEDICATE THIS VOLUME",
"WITH THE MOST ENTHUSIASTIC ADMIRATION AND",
"WEST POINT 1831", "DEAR B"}
in_poem_section = False
current_title = ""
current_body_lines = []
skip_sections = {"CONTENTS", "NOTE", "NOTES", "INDEX", "APPENDIX",
"MEMOIR", "PREFACE", "INTRODUCTION", "BIBLIOGRAPHY"}
def _is_title(stripped, idx):
return (stripped and
len(stripped) < 60 and
stripped == stripped.upper() and
re.search(r"[A-Z]{3,}", stripped) and
not stripped.startswith("[") and
not stripped.startswith("BY ") and
not re.match(r"^\d+$", stripped) and
not re.match(r"^\d+[.\s]", stripped) and
not re.match(r"^[IVXLC]+\.$", stripped) and
not stripped.startswith("***") and
idx > 0 and not lines_list[idx - 1].strip())
def _save_current():
if current_title and current_body_lines:
body_text = "\n".join(current_body_lines)
cleaned = body_text.strip()
if (not re.match(r"^[\s*]+$", cleaned)
and len(cleaned) >= 50):
p = make_poem(current_title, body_text,
"Edgar Allan Poe",
"Complete Poetical Works of Edgar Allan Poe",
"1827\u20131849")
if p:
poems.append(p)
def _norm(s):
return re.sub(r"[.,:;\"\'-]", "", s).strip()
i = 0
while i < len(lines):
line = lines[i]
while i < len(lines_list):
line = lines_list[i]
stripped = line.strip()
# Title detection: short ALL CAPS line after blank line
if (stripped and
len(stripped) < 60 and
stripped == stripped.upper() and
re.search(r"[A-Z]{3,}", stripped) and
not stripped.startswith("[") and
not stripped.startswith("BY ") and
not re.match(r"^\d+$", stripped) and
not stripped.startswith("***") and
i > 0 and not lines[i-1].strip()):
if _is_title(stripped, i):
clean = stripped.rstrip(".,:;").rstrip()
normed = _norm(stripped)
# Check if this is a section to skip
if stripped.split(".")[0].strip() in skip_sections:
if current_title and current_body_lines:
p = make_poem(
current_title,
"\n".join(current_body_lines),
"Edgar Allan Poe",
"Complete Poetical Works of Edgar Allan Poe",
"18271849",
)
if p:
poems.append(p)
# Check for poem section header
if clean in poem_sections or normed in {_norm(s) for s in poem_sections}:
_save_current()
in_poem_section = True
current_title = ""
current_body_lines = []
i += 1
continue
# Save previous poem
if current_title and current_body_lines:
p = make_poem(
current_title,
"\n".join(current_body_lines),
"Edgar Allan Poe",
"Complete Poetical Works of Edgar Allan Poe",
"18271849",
)
if p:
poems.append(p)
# Check for non-poem section header
if clean in non_poem_sections or normed in {_norm(s) for s in non_poem_sections}:
_save_current()
in_poem_section = False
current_title = ""
current_body_lines = []
i += 1
continue
current_title = stripped.title()
current_body_lines = []
i += 1
continue
# Within a poem section
if in_poem_section:
# Skip certain sub-headings without starting a poem
if normed in {_norm(s) for s in skip_titles}:
_save_current()
current_title = ""
current_body_lines = []
i += 1
continue
# New poem title
_save_current()
title = stripped.title()
# Give "Part I" / "Part Ii" proper names
if re.match(r"Part [Ii]+\.", title):
title = "Al Aaraaf — " + title
current_title = title
current_body_lines = []
i += 1
continue
if current_title:
if in_poem_section and current_title:
current_body_lines.append(line)
i += 1
if current_title and current_body_lines:
p = make_poem(
current_title,
"\n".join(current_body_lines),
"Edgar Allan Poe",
"Complete Poetical Works of Edgar Allan Poe",
"18271849",
)
if p:
poems.append(p)
_save_current()
return poems

@@ -0,0 +1 @@
Subproject commit 7fd16c08b20d81da497d2efb44af2e83860382f4

File diff suppressed because one or more lines are too long
Loading…
Cancel
Save