diff --git a/CLAUDE.md b/CLAUDE.md
index 4853422..313ab59 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -9,6 +9,7 @@ A Python project that downloads, parses, and displays historic love letters and
 ```
 ├── download_letters.py   # Downloads & parses 11 letter sources from Gutenberg
 ├── download_poetry.py    # Downloads & parses 15 poetry sources from Gutenberg
+├── generate_web_data.py  # Combines JSON files into web UI data (letters.json / poetry.json)
 ├── love_letters.py       # CLI app: displays random letters in the terminal
 ├── letters/              # Pre-parsed letter JSON files (11 sources, ~1,307 letters)
 ├── poetry/               # Pre-parsed poetry JSON files (15 sources, ~3,098 poems)
@@ -40,25 +41,32 @@ python3 love_letters.py --source napoleon # Filter by source
 1. **Download scripts** fetch raw `.txt` files from Project Gutenberg via `urllib`
 2. Each source has a **custom extractor function** that parses the Gutenberg text format
 3. Parsed data is saved as JSON to `letters/` or `poetry/` directories
-4. A separate step combines the individual JSON files into `letters.json` / `poetry.json` for the web UI (these live in `hicalsoft.github.io/*/data/`)
-
-### Regenerating web UI data
-
-```python
-# Letters — run from project root
-import json, os, glob
-out = {"authors": {}, "letters": []}
-for f in sorted(glob.glob("letters/*.json")):
-    data = json.load(open(f))
-    for l in data:
-        out["letters"].append({"a": l["author"], "r": l["recipient"], "h": l.get("heading",""), "b": l["body"], "s": l["source"], "p": l.get("period","")})
-        out["authors"].setdefault(l["author"], 0)
-        out["authors"][l["author"]] += 1
-json.dump(out, open("hicalsoft.github.io/letters/data/letters.json","w"), separators=(",",":"))
-
-# Poetry — same pattern with {authors, poems} structure
+4. `generate_web_data.py` combines the individual JSON files into `letters.json` / `poetry.json` for the web UI
+
+### Adding new sources or regenerating data
+
+```bash
+# Step 1: Download/parse source data (requires internet)
+python3 download_letters.py             # Re-download all letter sources
+python3 download_poetry.py              # Re-download all poetry sources
+
+# Step 2: Regenerate web UI JSON (no internet needed)
+python3 generate_web_data.py            # Both letters + poetry
+python3 generate_web_data.py --letters  # Letters only
+python3 generate_web_data.py --poetry   # Poetry only
+
+# Step 3: Commit the web UI submodule first, then this repo (so it records the new submodule commit)
+(cd hicalsoft.github.io && git add -A && git commit -m "Update data")
+git add -A && git commit -m "Update data"
 ```
 
+To add a new letter or poetry source:
+1. Add an extractor function to `download_letters.py` or `download_poetry.py`
+2. Add the source to the `SOURCES` list in that file's `main()`
+3. Run the download script to generate the new JSON in `letters/` or `poetry/`
+4. Run `python3 generate_web_data.py` to rebuild the combined web UI data
+5. Commit to both repositories
+
 ## Gutenberg Parsing Notes
 
 - **Line endings**: Always normalize with `.replace("\r\n", "\n").replace("\r", "\n")` before regex splitting
diff --git a/generate_web_data.py b/generate_web_data.py
new file mode 100644
index 0000000..81229c8
--- /dev/null
+++ b/generate_web_data.py
@@ -0,0 +1,138 @@
+#!/usr/bin/env python3
+"""Generate combined JSON data files for the hicalsoft.github.io web UI.
+
+Reads individual JSON files from letters/ and poetry/ directories and
+produces the combined letters.json and poetry.json used by the web pages.
+
+Usage:
+    python3 generate_web_data.py            # Generate both
+    python3 generate_web_data.py --letters  # Letters only
+    python3 generate_web_data.py --poetry   # Poetry only
+"""
+
+import argparse
+import glob
+import json
+import os
+import sys
+from collections import Counter
+
+LETTERS_SRC = "letters"
+POETRY_SRC = "poetry"
+LETTERS_OUT = os.path.join("hicalsoft.github.io", "letters", "data", "letters.json")
+POETRY_OUT = os.path.join("hicalsoft.github.io", "poetry", "data", "poetry.json")
+
+
+def _load_items(src_dir):
+    """Return every record from the ``*.json`` files in *src_dir*.
+
+    Files are read in sorted filename order so the combined output is
+    deterministic. Each file must contain a JSON array of objects.
+    """
+    items = []
+    for path in sorted(glob.glob(os.path.join(src_dir, "*.json"))):
+        # Decode as UTF-8 explicitly: open() otherwise uses the platform's
+        # locale encoding, which can mis-decode non-ASCII text.
+        with open(path, encoding="utf-8") as fh:
+            items.extend(json.load(fh))
+    return items
+
+
+def _count_authors(entries):
+    """Return ``{author: entry count}`` keyed by each entry's ``"a"`` field."""
+    return dict(Counter(entry["a"] for entry in entries))
+
+
+def _write_json(out_path, payload):
+    """Write *payload* to *out_path* as compact JSON; return its size in MB."""
+    os.makedirs(os.path.dirname(out_path), exist_ok=True)
+    with open(out_path, "w", encoding="utf-8") as fh:
+        # No whitespace between tokens keeps the generated file small.
+        json.dump(payload, fh, separators=(",", ":"))
+    return os.path.getsize(out_path) / 1024 / 1024
+
+
+def generate_letters():
+    """Combine individual letter JSON files into one for the web UI.
+
+    Reads LETTERS_SRC/*.json and writes ``{"authors": ..., "letters": ...}``
+    to LETTERS_OUT, then prints a one-line summary and the output path.
+
+    Raises:
+        KeyError: if a record lacks "body", "author" or "source".
+    """
+    letters = [
+        {
+            "h": item.get("heading", ""),
+            "b": item["body"],
+            "a": item["author"],
+            "r": item.get("recipient", ""),
+            "s": item["source"],
+            "p": item.get("period", ""),
+        }
+        for item in _load_items(LETTERS_SRC)
+    ]
+    authors = _count_authors(letters)
+    size_mb = _write_json(LETTERS_OUT, {"authors": authors, "letters": letters})
+
+    print(f"Letters: {len(letters)} letters from {len(authors)} authors ({size_mb:.2f} MB)")
+    print(f" → {LETTERS_OUT}")
+
+
+def generate_poetry():
+    """Combine individual poetry JSON files into one for the web UI.
+
+    Reads POETRY_SRC/*.json and writes ``{"authors": ..., "poems": ...}``
+    to POETRY_OUT, then prints a one-line summary and the output path.
+
+    Raises:
+        KeyError: if a record lacks "body", "author" or "source".
+    """
+    poems = [
+        {
+            "t": item.get("title", ""),
+            "b": item["body"],
+            "a": item["author"],
+            "s": item["source"],
+            "p": item.get("period", ""),
+        }
+        for item in _load_items(POETRY_SRC)
+    ]
+    authors = _count_authors(poems)
+    size_mb = _write_json(POETRY_OUT, {"authors": authors, "poems": poems})
+
+    print(f"Poetry: {len(poems)} poems from {len(authors)} poets ({size_mb:.2f} MB)")
+    print(f" → {POETRY_OUT}")
+
+
+def main():
+    """Parse the command line and regenerate the requested data files.
+
+    Exits with status 1 if a required source directory is missing.
+    """
+    parser = argparse.ArgumentParser(
+        description="Generate combined JSON data for the hicalsoft.github.io web UI."
+    )
+    parser.add_argument("--letters", action="store_true", help="Generate letters.json only")
+    parser.add_argument("--poetry", action="store_true", help="Generate poetry.json only")
+    args = parser.parse_args()
+
+    # If neither flag is set, generate both
+    generate_all = not (args.letters or args.poetry)
+    jobs = [
+        (args.letters or generate_all, LETTERS_SRC, "download_letters.py", generate_letters),
+        (args.poetry or generate_all, POETRY_SRC, "download_poetry.py", generate_poetry),
+    ]
+
+    for wanted, src_dir, downloader, generate in jobs:
+        if not wanted:
+            continue
+        if not os.path.isdir(src_dir):
+            print(f"Error: {src_dir}/ directory not found. Run {downloader} first.",
+                  file=sys.stderr)
+            sys.exit(1)
+        generate()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/hicalsoft.github.io b/hicalsoft.github.io
index c292f8e..2029d6f 160000
--- a/hicalsoft.github.io
+++ b/hicalsoft.github.io
@@ -1 +1 @@
-Subproject commit c292f8eed1677afa7de015a8a32098f7b3f52956
+Subproject commit 2029d6f5d701f4492901fb6e58789714e687ad2f