Fix Poe parser and add font size controls

- Rewrite Poe extractor with section tracking: 51 clean poems
  (was 108 with junk notes, prose, and dividers)
- Skip memoir, notes, prose poems, essays, dedications
- Properly extract from all 4 poem sections

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
main
Ibraheem Saleh 1 week ago
parent e4464b6773
commit 49d11e8869

@@ -412,82 +412,112 @@ def extract_poe(text):
"""Complete Poetical Works of Edgar Allan Poe (Gutenberg 10031).""" """Complete Poetical Works of Edgar Allan Poe (Gutenberg 10031)."""
body = extract_body(text) body = extract_body(text)
poems = [] poems = []
lines_list = body.split("\n")
# Find the start of actual poems (after intro/contents)
# Poems have titles in ALL CAPS separated by blank lines # Top-level sections containing actual poems
lines = body.split("\n") poem_sections = {"POEMS OF LATER LIFE", "POEMS OF MANHOOD",
"POEMS OF YOUTH", "DOUBTFUL POEMS"}
# Top-level sections that are NOT poems
non_poem_sections = {"CONTENTS", "TABLE OF CONTENTS", "NOTES", "NOTE",
"MEMOIR", "MEMOIR OF EDGAR ALLAN POE",
"PROSE POEMS", "ESSAYS", "NOTE ON POLITIAN",
'SCENES FROM "POLITIAN"'}
# Sub-headings within poem sections to skip
skip_titles = {"PREFACE", "INTRODUCTION TO POEMS1831",
"LETTER TO MR B", "JOHN H INGRAM",
"THE NOBLEST OF HER SEX",
"MISS ELIZABETH BARRETT BARRETT",
"OF ENGLAND", "I DEDICATE THIS VOLUME",
"WITH THE MOST ENTHUSIASTIC ADMIRATION AND",
"WEST POINT 1831", "DEAR B"}
in_poem_section = False
current_title = "" current_title = ""
current_body_lines = [] current_body_lines = []
skip_sections = {"CONTENTS", "NOTE", "NOTES", "INDEX", "APPENDIX",
"MEMOIR", "PREFACE", "INTRODUCTION", "BIBLIOGRAPHY"}
i = 0 def _is_title(stripped, idx):
while i < len(lines): return (stripped and
line = lines[i]
stripped = line.strip()
# Title detection: short ALL CAPS line after blank line
if (stripped and
len(stripped) < 60 and len(stripped) < 60 and
stripped == stripped.upper() and stripped == stripped.upper() and
re.search(r"[A-Z]{3,}", stripped) and re.search(r"[A-Z]{3,}", stripped) and
not stripped.startswith("[") and not stripped.startswith("[") and
not stripped.startswith("BY ") and not stripped.startswith("BY ") and
not re.match(r"^\d+$", stripped) and not re.match(r"^\d+$", stripped) and
not re.match(r"^\d+[.\s]", stripped) and
not re.match(r"^[IVXLC]+\.$", stripped) and
not stripped.startswith("***") and not stripped.startswith("***") and
i > 0 and not lines[i-1].strip()): idx > 0 and not lines_list[idx - 1].strip())
# Check if this is a section to skip def _save_current():
if stripped.split(".")[0].strip() in skip_sections:
if current_title and current_body_lines: if current_title and current_body_lines:
p = make_poem( body_text = "\n".join(current_body_lines)
current_title, cleaned = body_text.strip()
"\n".join(current_body_lines), if (not re.match(r"^[\s*]+$", cleaned)
and len(cleaned) >= 50):
p = make_poem(current_title, body_text,
"Edgar Allan Poe", "Edgar Allan Poe",
"Complete Poetical Works of Edgar Allan Poe", "Complete Poetical Works of Edgar Allan Poe",
"18271849", "1827\u20131849")
)
if p: if p:
poems.append(p) poems.append(p)
def _norm(s):
return re.sub(r"[.,:;\"\'-]", "", s).strip()
i = 0
while i < len(lines_list):
line = lines_list[i]
stripped = line.strip()
if _is_title(stripped, i):
clean = stripped.rstrip(".,:;").rstrip()
normed = _norm(stripped)
# Check for poem section header
if clean in poem_sections or normed in {_norm(s) for s in poem_sections}:
_save_current()
in_poem_section = True
current_title = "" current_title = ""
current_body_lines = [] current_body_lines = []
i += 1 i += 1
continue continue
# Save previous poem # Check for non-poem section header
if current_title and current_body_lines: if clean in non_poem_sections or normed in {_norm(s) for s in non_poem_sections}:
p = make_poem( _save_current()
current_title, in_poem_section = False
"\n".join(current_body_lines), current_title = ""
"Edgar Allan Poe", current_body_lines = []
"Complete Poetical Works of Edgar Allan Poe", i += 1
"18271849", continue
)
if p:
poems.append(p)
current_title = stripped.title() # Within a poem section
if in_poem_section:
# Skip certain sub-headings without starting a poem
if normed in {_norm(s) for s in skip_titles}:
_save_current()
current_title = ""
current_body_lines = [] current_body_lines = []
i += 1 i += 1
continue continue
if current_title: # New poem title
_save_current()
title = stripped.title()
# Give "Part I" / "Part Ii" proper names
if re.match(r"Part [Ii]+\.", title):
title = "Al Aaraaf — " + title
current_title = title
current_body_lines = []
i += 1
continue
if in_poem_section and current_title:
current_body_lines.append(line) current_body_lines.append(line)
i += 1 i += 1
if current_title and current_body_lines: _save_current()
p = make_poem(
current_title,
"\n".join(current_body_lines),
"Edgar Allan Poe",
"Complete Poetical Works of Edgar Allan Poe",
"18271849",
)
if p:
poems.append(p)
return poems return poems

@@ -0,0 +1 @@
Subproject commit 7fd16c08b20d81da497d2efb44af2e83860382f4

File diff suppressed because one or more lines are too long
Loading…
Cancel
Save