# inspect_snapshot.py
"""Inspect a snapshot folder: list its tables, pretty-print one, export to CSV.

Expected on-disk layout:
    snapshot_dir/
        meta.json               title, url, captured_at, table_count, tables[]
        tables/table_NN.json    {"caption": ..., "rows": [[cell, ...], ...]}
"""
import argparse
import csv
import json
from pathlib import Path
from typing import List, Any

# Display tuning for print_table: cells longer than this are truncated with an
# ellipsis, so no printed column ever needs to be wider than this either.
_CELL_MAX = 33


def load_json(p: Path) -> dict:
    """Read *p* as UTF-8 text and parse it as JSON."""
    return json.loads(p.read_text(encoding="utf-8"))


def _cell_text(cell: Any) -> str:
    """Coerce a raw JSON cell to str.

    Table rows come straight from JSON and are not guaranteed to be
    all-string: numbers and null are possible and previously crashed the
    string operations (len/replace/ljust/join). null renders as "".
    """
    if isinstance(cell, str):
        return cell
    return "" if cell is None else str(cell)


def find_table_files(snapshot_dir: Path) -> List[Path]:
    """Return the snapshot's table_*.json files, sorted by filename.

    Raises SystemExit if the tables/ folder is missing.
    """
    tables_dir = snapshot_dir / "tables"
    if not tables_dir.exists():
        raise SystemExit(f"Missing tables/ folder in: {snapshot_dir}")
    return sorted(tables_dir.glob("table_*.json"))


def list_tables(snapshot_dir: Path) -> None:
    """Print snapshot metadata, then a one-line summary of every table.

    Each summary line shows the table index, caption, row x col counts, and a
    short preview of row 0 (which is usually the header row).

    Raises SystemExit if meta.json is missing.
    """
    meta_path = snapshot_dir / "meta.json"
    if not meta_path.exists():
        raise SystemExit(f"Missing meta.json in: {snapshot_dir}")
    meta = load_json(meta_path)
    print(f"\nSnapshot: {snapshot_dir}")
    print(f"Title: {meta.get('title')}")
    print(f"URL: {meta.get('url')}")
    print(f"Captured: {meta.get('captured_at')}")
    print(f"Tables: {meta.get('table_count')}\n")
    tables_dir = snapshot_dir / "tables"
    for t in meta.get("tables", []):
        idx = int(t["index"])
        cap = t.get("caption") or "(no caption)"
        tjson = tables_dir / f"table_{idx:02d}.json"
        if not tjson.exists():
            # meta.json promised a table that was never written to disk
            print(f"[{idx:02d}] {cap} (missing file)")
            continue
        data = load_json(tjson)
        rows = data.get("rows", [])
        r = len(rows)
        c = max((len(row) for row in rows), default=0)
        # quick preview of row 0 (often header); cells coerced to str because
        # JSON rows may contain numbers or null
        preview = ""
        if rows:
            preview = " | ".join(_cell_text(cell) for cell in rows[0][:6])
            if len(preview) > 100:
                preview = preview[:100] + "…"
        print(f"[{idx:02d}] {cap} ({r}x{c})")
        if preview:
            print(f" {preview}")


def print_table(snapshot_dir: Path, idx: int, limit_rows: int = 60, limit_cols: int = 14) -> List[List[str]]:
    """Pretty-print table *idx* as an aligned text grid; return its raw rows.

    Only the first *limit_rows* rows and *limit_cols* columns are printed, but
    the returned rows are always the complete, untruncated data — so a CSV
    export built from the return value is unaffected by the display limits.

    Raises SystemExit if the table file does not exist.
    """
    tjson = snapshot_dir / "tables" / f"table_{idx:02d}.json"
    if not tjson.exists():
        raise SystemExit(f"Table not found: {tjson}")
    data = load_json(tjson)
    caption = data.get("caption") or "(no caption)"
    rows: List[List[str]] = data.get("rows", [])
    print(f"\nTable [{idx:02d}] — {caption}\n")
    if not rows:
        print("(empty)")
        return rows
    cols = min(max((len(r) for r in rows), default=0), limit_cols)

    # Column widths over the rows we will actually print.  Capped at
    # _CELL_MAX to match the display truncation in fmt_row — the previous cap
    # of 34 left columns one char wider than any (truncated) cell could be.
    widths = [0] * cols
    for r in rows[:limit_rows]:
        for j in range(cols):
            cell = _cell_text(r[j]) if j < len(r) else ""
            widths[j] = min(_CELL_MAX, max(widths[j], len(cell)))

    def fmt_row(r: List[str]) -> str:
        """Render one row: pad short rows, flatten newlines, truncate, align."""
        out = []
        for j in range(cols):
            cell = _cell_text(r[j]) if j < len(r) else ""
            cell = cell.replace("\n", " ")
            if len(cell) > _CELL_MAX:
                cell = cell[:_CELL_MAX - 1] + "…"
            out.append(cell.ljust(widths[j]))
        return " | ".join(out)

    for i, r in enumerate(rows[:limit_rows]):
        print(fmt_row(r))
        if i == 0:
            # underline row 0, which is usually the header row
            print("-" * min(160, sum(widths) + 3 * (cols - 1)))
    if len(rows) > limit_rows:
        print(f"\n…({len(rows) - limit_rows} more rows)")
    return rows


def export_csv(rows: List[List[Any]], out_path: Path) -> None:
    """Write *rows* to *out_path* as UTF-8 CSV, creating parent dirs as needed."""
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", newline="", encoding="utf-8") as f:
        csv.writer(f).writerows(rows)


def main() -> None:
    """CLI entry: list all tables, or print one table and optionally export it."""
    ap = argparse.ArgumentParser()
    ap.add_argument("snapshot_dir", help="Path to a snapshot folder (has meta.json + tables/)")
    ap.add_argument("--table", type=int, help="Print a specific table index (e.g. 5)")
    ap.add_argument("--csv", help="Export printed table to CSV at this path")
    ap.add_argument("--rows", type=int, default=60, help="Row print limit (default 60)")
    ap.add_argument("--cols", type=int, default=14, help="Col print limit (default 14)")
    args = ap.parse_args()

    snapshot_dir = Path(args.snapshot_dir).expanduser().resolve()
    if args.table is None:
        list_tables(snapshot_dir)
        return
    rows = print_table(snapshot_dir, args.table, limit_rows=args.rows, limit_cols=args.cols)
    if args.csv:
        export_csv(rows, Path(args.csv).expanduser().resolve())
        print(f"\n✅ CSV written: {args.csv}")


if __name__ == "__main__":
    main()