Add generate_web_data.py for rebuilding web UI JSON

Script combines letters/*.json and poetry/*.json into the combined
data files used by hicalsoft.github.io web pages. Supports --letters
and --poetry flags for selective regeneration.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
main
Ibraheem Saleh 1 week ago
parent 6b308d3a01
commit 063f96287f

@ -9,6 +9,7 @@ A Python project that downloads, parses, and displays historic love letters and
```
├── download_letters.py # Downloads & parses 11 letter sources from Gutenberg
├── download_poetry.py # Downloads & parses 15 poetry sources from Gutenberg
├── generate_web_data.py # Combines JSON files into web UI data (letters.json / poetry.json)
├── love_letters.py # CLI app: displays random letters in the terminal
├── letters/ # Pre-parsed letter JSON files (11 sources, ~1,307 letters)
├── poetry/ # Pre-parsed poetry JSON files (15 sources, ~3,098 poems)
@ -40,25 +41,32 @@ python3 love_letters.py --source napoleon # Filter by source
1. **Download scripts** fetch raw `.txt` files from Project Gutenberg via `urllib`
2. Each source has a **custom extractor function** that parses the Gutenberg text format
3. Parsed data is saved as JSON to `letters/` or `poetry/` directories
4. A separate step combines the individual JSON files into `letters.json` / `poetry.json` for the web UI (these live in `hicalsoft.github.io/*/data/`)
### Regenerating web UI data
```python
# Letters — run from project root
import json, os, glob
out = {"authors": {}, "letters": []}
for f in sorted(glob.glob("letters/*.json")):
data = json.load(open(f))
for l in data:
out["letters"].append({"a": l["author"], "r": l["recipient"], "h": l.get("heading",""), "b": l["body"], "s": l["source"], "p": l.get("period","")})
out["authors"].setdefault(l["author"], 0)
out["authors"][l["author"]] += 1
json.dump(out, open("hicalsoft.github.io/letters/data/letters.json","w"), separators=(",",":"))
# Poetry — same pattern with {authors, poems} structure
4. `generate_web_data.py` combines the individual JSON files into `letters.json` / `poetry.json` for the web UI
### Adding new sources or regenerating data
```bash
# Step 1: Download/parse source data (requires internet)
python3 download_letters.py # Re-download all letter sources
python3 download_poetry.py # Re-download all poetry sources
# Step 2: Regenerate web UI JSON (no internet needed)
python3 generate_web_data.py # Both letters + poetry
python3 generate_web_data.py --letters # Letters only
python3 generate_web_data.py --poetry # Poetry only
# Step 3: Commit changes to both repos (hicalsoft.github.io first, so the parent commit records its new commit)
(cd hicalsoft.github.io && git add -A && git commit -m "Update data")
git add -A && git commit -m "Update data"
```
To add a new letter or poetry source:
1. Add an extractor function to `download_letters.py` or `download_poetry.py`
2. Add the source to the `SOURCES` list in that file's `main()`
3. Run the download script to generate the new JSON in `letters/` or `poetry/`
4. Run `python3 generate_web_data.py` to rebuild the combined web UI data
5. Commit to both repositories
## Gutenberg Parsing Notes
- **Line endings**: Always normalize with `.replace("\r\n", "\n").replace("\r", "\n")` before regex splitting

@ -0,0 +1,114 @@
#!/usr/bin/env python3
"""Generate combined JSON data files for the hicalsoft.github.io web UI.
Reads individual JSON files from letters/ and poetry/ directories and
produces the combined letters.json and poetry.json used by the web pages.
Usage:
python3 generate_web_data.py # Generate both
python3 generate_web_data.py --letters # Letters only
python3 generate_web_data.py --poetry # Poetry only
"""
import argparse
import glob
import json
import os
import sys
# Directories (relative to the current working directory) that hold the
# individual per-source JSON files read by this script.
LETTERS_SRC = "letters"
POETRY_SRC = "poetry"
# Paths of the combined JSON files this script writes for the web UI.
LETTERS_OUT = os.path.join("hicalsoft.github.io", "letters", "data", "letters.json")
POETRY_OUT = os.path.join("hicalsoft.github.io", "poetry", "data", "poetry.json")
def generate_letters(src_dir=None, out_path=None):
    """Combine individual letter JSON files into one for the web UI.

    Every ``*.json`` file in the source directory is loaded in sorted
    filename order. Each entry is reduced to the short keys used by the
    combined file: ``h`` (heading), ``b`` (body), ``a`` (author),
    ``r`` (recipient), ``s`` (source) and ``p`` (period). The result is
    written as compact JSON of the form
    ``{"authors": {name: count}, "letters": [...]}`` and a short summary
    is printed to stdout.

    Args:
        src_dir: Directory containing the per-source letter files.
            Defaults to ``LETTERS_SRC``.
        out_path: Path of the combined file to write. Missing parent
            directories are created. Defaults to ``LETTERS_OUT``.

    Raises:
        KeyError: If an entry lacks ``body``, ``author`` or ``source``.
    """
    if src_dir is None:
        src_dir = LETTERS_SRC
    if out_path is None:
        out_path = LETTERS_OUT

    letters = []
    authors = {}
    for path in sorted(glob.glob(os.path.join(src_dir, "*.json"))):
        # Decode explicitly as UTF-8: the platform default encoding is not
        # UTF-8 everywhere, and the texts may contain non-ASCII characters.
        with open(path, encoding="utf-8") as fh:
            entries = json.load(fh)
        for item in entries:
            letters.append({
                "h": item.get("heading", ""),
                "b": item["body"],
                "a": item["author"],
                "r": item.get("recipient", ""),
                "s": item["source"],
                "p": item.get("period", ""),
            })
            # Per-author tally, kept in first-seen order.
            authors[item["author"]] = authors.get(item["author"], 0) + 1

    out = {"authors": authors, "letters": letters}

    # os.makedirs("") raises, so only create a directory when there is one.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # json.dump escapes non-ASCII by default, so the payload is pure ASCII;
    # the explicit encoding simply keeps the write platform-independent.
    with open(out_path, "w", encoding="utf-8") as fh:
        json.dump(out, fh, separators=(",", ":"))

    size_mb = os.path.getsize(out_path) / 1024 / 1024
    print(f"Letters: {len(letters)} letters from {len(authors)} authors ({size_mb:.2f} MB)")
    print(f"{out_path}")
def generate_poetry(src_dir=None, out_path=None):
    """Combine individual poetry JSON files into one for the web UI.

    Every ``*.json`` file in the source directory is loaded in sorted
    filename order. Each entry is reduced to the short keys used by the
    combined file: ``t`` (title), ``b`` (body), ``a`` (author),
    ``s`` (source) and ``p`` (period). The result is written as compact
    JSON of the form ``{"authors": {name: count}, "poems": [...]}`` and a
    short summary is printed to stdout.

    Args:
        src_dir: Directory containing the per-source poetry files.
            Defaults to ``POETRY_SRC``.
        out_path: Path of the combined file to write. Missing parent
            directories are created. Defaults to ``POETRY_OUT``.

    Raises:
        KeyError: If an entry lacks ``body``, ``author`` or ``source``.
    """
    if src_dir is None:
        src_dir = POETRY_SRC
    if out_path is None:
        out_path = POETRY_OUT

    poems = []
    authors = {}
    for path in sorted(glob.glob(os.path.join(src_dir, "*.json"))):
        # Decode explicitly as UTF-8: the platform default encoding is not
        # UTF-8 everywhere, and the texts may contain non-ASCII characters.
        with open(path, encoding="utf-8") as fh:
            entries = json.load(fh)
        for item in entries:
            poems.append({
                "t": item.get("title", ""),
                "b": item["body"],
                "a": item["author"],
                "s": item["source"],
                "p": item.get("period", ""),
            })
            # Per-author tally, kept in first-seen order.
            authors[item["author"]] = authors.get(item["author"], 0) + 1

    out = {"authors": authors, "poems": poems}

    # os.makedirs("") raises, so only create a directory when there is one.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # json.dump escapes non-ASCII by default, so the payload is pure ASCII;
    # the explicit encoding simply keeps the write platform-independent.
    with open(out_path, "w", encoding="utf-8") as fh:
        json.dump(out, fh, separators=(",", ":"))

    size_mb = os.path.getsize(out_path) / 1024 / 1024
    print(f"Poetry: {len(poems)} poems from {len(authors)} poets ({size_mb:.2f} MB)")
    print(f"{out_path}")
def main():
    """Parse the command line and regenerate the requested web UI data.

    ``--letters`` and ``--poetry`` each select one data set; when neither
    is given, both are regenerated. If a required source directory is
    missing, an error is written to stderr and the process exits with
    status 1.
    """
    cli = argparse.ArgumentParser(
        description="Generate combined JSON data for the hicalsoft.github.io web UI."
    )
    cli.add_argument("--letters", action="store_true", help="Generate letters.json only")
    cli.add_argument("--poetry", action="store_true", help="Generate poetry.json only")
    options = cli.parse_args()

    # No selector at all means "regenerate everything".
    run_everything = not options.letters and not options.poetry
    want_letters = run_everything or options.letters
    want_poetry = run_everything or options.poetry

    if want_letters:
        if os.path.isdir(LETTERS_SRC):
            generate_letters()
        else:
            print(f"Error: {LETTERS_SRC}/ directory not found. Run download_letters.py first.",
                  file=sys.stderr)
            sys.exit(1)

    if want_poetry:
        if os.path.isdir(POETRY_SRC):
            generate_poetry()
        else:
            print(f"Error: {POETRY_SRC}/ directory not found. Run download_poetry.py first.",
                  file=sys.stderr)
            sys.exit(1)
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

@ -1 +1 @@
Subproject commit c292f8eed1677afa7de015a8a32098f7b3f52956
Subproject commit 2029d6f5d701f4492901fb6e58789714e687ad2f
Loading…
Cancel
Save