|
|
|
|
@ -412,82 +412,112 @@ def extract_poe(text):
|
|
|
|
|
"""Complete Poetical Works of Edgar Allan Poe (Gutenberg 10031)."""
|
|
|
|
|
body = extract_body(text)
|
|
|
|
|
poems = []
|
|
|
|
|
|
|
|
|
|
# Find the start of actual poems (after intro/contents)
|
|
|
|
|
# Poems have titles in ALL CAPS separated by blank lines
|
|
|
|
|
lines = body.split("\n")
|
|
|
|
|
|
|
|
|
|
lines_list = body.split("\n")
|
|
|
|
|
|
|
|
|
|
# Top-level sections containing actual poems
|
|
|
|
|
poem_sections = {"POEMS OF LATER LIFE", "POEMS OF MANHOOD",
|
|
|
|
|
"POEMS OF YOUTH", "DOUBTFUL POEMS"}
|
|
|
|
|
# Top-level sections that are NOT poems
|
|
|
|
|
non_poem_sections = {"CONTENTS", "TABLE OF CONTENTS", "NOTES", "NOTE",
|
|
|
|
|
"MEMOIR", "MEMOIR OF EDGAR ALLAN POE",
|
|
|
|
|
"PROSE POEMS", "ESSAYS", "NOTE ON POLITIAN",
|
|
|
|
|
'SCENES FROM "POLITIAN"'}
|
|
|
|
|
# Sub-headings within poem sections to skip
|
|
|
|
|
skip_titles = {"PREFACE", "INTRODUCTION TO POEMS1831",
|
|
|
|
|
"LETTER TO MR B", "JOHN H INGRAM",
|
|
|
|
|
"THE NOBLEST OF HER SEX",
|
|
|
|
|
"MISS ELIZABETH BARRETT BARRETT",
|
|
|
|
|
"OF ENGLAND", "I DEDICATE THIS VOLUME",
|
|
|
|
|
"WITH THE MOST ENTHUSIASTIC ADMIRATION AND",
|
|
|
|
|
"WEST POINT 1831", "DEAR B"}
|
|
|
|
|
|
|
|
|
|
in_poem_section = False
|
|
|
|
|
current_title = ""
|
|
|
|
|
current_body_lines = []
|
|
|
|
|
skip_sections = {"CONTENTS", "NOTE", "NOTES", "INDEX", "APPENDIX",
|
|
|
|
|
"MEMOIR", "PREFACE", "INTRODUCTION", "BIBLIOGRAPHY"}
|
|
|
|
|
|
|
|
|
|
i = 0
|
|
|
|
|
while i < len(lines):
|
|
|
|
|
line = lines[i]
|
|
|
|
|
stripped = line.strip()
|
|
|
|
|
|
|
|
|
|
# Title detection: short ALL CAPS line after blank line
|
|
|
|
|
if (stripped and
|
|
|
|
|
def _is_title(stripped, idx):
    """Report whether the line at position *idx* looks like a heading.

    Args:
        stripped: The line's text with surrounding whitespace removed.
        idx: Index of that line in ``lines_list`` (the body split on
            newlines, taken from the enclosing scope).

    Returns:
        True only when every check below holds; False otherwise.
    """
    return bool(
        stripped
        and len(stripped) < 60
        # Upper-casing must leave the text unchanged, and it must contain a
        # run of at least three capital letters.
        and stripped == stripped.upper()
        and re.search(r"[A-Z]{3,}", stripped)
        # Not a bracketed line, a "BY ..." line, or a "***" line.
        and not stripped.startswith(("[", "BY ", "***"))
        # Not a bare number, a numbered item ("12. ..." / "12 ..."), or a
        # roman numeral followed by a period ("XIV.").
        and not re.match(r"^\d+$", stripped)
        and not re.match(r"^\d+[.\s]", stripped)
        and not re.match(r"^[IVXLC]+\.$", stripped)
        # The preceding line must exist and be blank.
        and idx > 0
        and not lines_list[idx - 1].strip()
    )
|
|
|
|
|
|
|
|
|
|
# Check if this is a section to skip
|
|
|
|
|
if stripped.split(".")[0].strip() in skip_sections:
|
|
|
|
|
def _save_current():
    """Append the poem currently being accumulated to ``poems``, if usable.

    Reads ``current_title`` and ``current_body_lines`` from the enclosing
    scope and does not reset them; callers clear that state themselves.
    Nothing is appended when there is no title, no collected lines, or the
    body fails the filter below.
    """
    if not (current_title and current_body_lines):
        return

    body_text = "\n".join(current_body_lines)
    cleaned = body_text.strip()

    # Discard bodies made only of whitespace/asterisks, and bodies shorter
    # than 50 characters after trimming.
    if re.match(r"^[\s*]+$", cleaned) or len(cleaned) < 50:
        return

    p = make_poem(current_title, body_text,
                  "Edgar Allan Poe",
                  "Complete Poetical Works of Edgar Allan Poe",
                  "1827\u20131849")
    # NOTE(review): make_poem is defined elsewhere; presumably it returns a
    # falsy value for a rejected poem - confirm against its definition.
    if p:
        poems.append(p)
|
|
|
|
|
|
|
|
|
|
def _norm(s):
|
|
|
|
|
return re.sub(r"[.,:;\"\'-]", "", s).strip()
|
|
|
|
|
|
|
|
|
|
i = 0
|
|
|
|
|
while i < len(lines_list):
|
|
|
|
|
line = lines_list[i]
|
|
|
|
|
stripped = line.strip()
|
|
|
|
|
|
|
|
|
|
if _is_title(stripped, i):
|
|
|
|
|
clean = stripped.rstrip(".,:;").rstrip()
|
|
|
|
|
normed = _norm(stripped)
|
|
|
|
|
|
|
|
|
|
# Check for poem section header
|
|
|
|
|
if clean in poem_sections or normed in {_norm(s) for s in poem_sections}:
|
|
|
|
|
_save_current()
|
|
|
|
|
in_poem_section = True
|
|
|
|
|
current_title = ""
|
|
|
|
|
current_body_lines = []
|
|
|
|
|
i += 1
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
# Save previous poem
|
|
|
|
|
if current_title and current_body_lines:
|
|
|
|
|
p = make_poem(
|
|
|
|
|
current_title,
|
|
|
|
|
"\n".join(current_body_lines),
|
|
|
|
|
"Edgar Allan Poe",
|
|
|
|
|
"Complete Poetical Works of Edgar Allan Poe",
|
|
|
|
|
"1827–1849",
|
|
|
|
|
)
|
|
|
|
|
if p:
|
|
|
|
|
poems.append(p)
|
|
|
|
|
# Check for non-poem section header
|
|
|
|
|
if clean in non_poem_sections or normed in {_norm(s) for s in non_poem_sections}:
|
|
|
|
|
_save_current()
|
|
|
|
|
in_poem_section = False
|
|
|
|
|
current_title = ""
|
|
|
|
|
current_body_lines = []
|
|
|
|
|
i += 1
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
current_title = stripped.title()
|
|
|
|
|
# Within a poem section
|
|
|
|
|
if in_poem_section:
|
|
|
|
|
# Skip certain sub-headings without starting a poem
|
|
|
|
|
if normed in {_norm(s) for s in skip_titles}:
|
|
|
|
|
_save_current()
|
|
|
|
|
current_title = ""
|
|
|
|
|
current_body_lines = []
|
|
|
|
|
i += 1
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
if current_title:
|
|
|
|
|
# New poem title
|
|
|
|
|
_save_current()
|
|
|
|
|
title = stripped.title()
|
|
|
|
|
# Give "Part I" / "Part Ii" proper names
|
|
|
|
|
if re.match(r"Part [Ii]+\.", title):
|
|
|
|
|
title = "Al Aaraaf — " + title
|
|
|
|
|
current_title = title
|
|
|
|
|
current_body_lines = []
|
|
|
|
|
i += 1
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
if in_poem_section and current_title:
|
|
|
|
|
current_body_lines.append(line)
|
|
|
|
|
|
|
|
|
|
i += 1
|
|
|
|
|
|
|
|
|
|
if current_title and current_body_lines:
|
|
|
|
|
p = make_poem(
|
|
|
|
|
current_title,
|
|
|
|
|
"\n".join(current_body_lines),
|
|
|
|
|
"Edgar Allan Poe",
|
|
|
|
|
"Complete Poetical Works of Edgar Allan Poe",
|
|
|
|
|
"1827–1849",
|
|
|
|
|
)
|
|
|
|
|
if p:
|
|
|
|
|
poems.append(p)
|
|
|
|
|
|
|
|
|
|
_save_current()
|
|
|
|
|
return poems
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|