Yeah baby

This commit is contained in:
Ben Mosley
2026-02-25 12:25:28 -06:00
commit e6acecd5b5
3 changed files with 464 additions and 0 deletions

127
inspect_snapshot.py Normal file
View File

@@ -0,0 +1,127 @@
# inspect_snapshot.py
import argparse
import csv
import json
from pathlib import Path
from typing import List, Any
def load_json(p: Path) -> dict:
    """Read a UTF-8 JSON file and return the parsed object."""
    text = p.read_text(encoding="utf-8")
    return json.loads(text)
def find_table_files(snapshot_dir: Path) -> List[Path]:
    """Return the snapshot's table_*.json paths, sorted by filename.

    Exits with an error message when the tables/ subfolder is absent.
    """
    folder = snapshot_dir / "tables"
    if folder.exists():
        return sorted(folder.glob("table_*.json"))
    raise SystemExit(f"Missing tables/ folder in: {snapshot_dir}")
def list_tables(snapshot_dir: Path) -> None:
    """Print a summary of the snapshot and one line per captured table.

    Reads meta.json for snapshot-level metadata, then cross-checks each
    table entry against the JSON files under tables/.  Tables listed in
    meta.json whose file is missing are flagged rather than skipped.

    Raises SystemExit if meta.json is absent.
    """
    meta_path = snapshot_dir / "meta.json"
    if not meta_path.exists():
        raise SystemExit(f"Missing meta.json in: {snapshot_dir}")
    meta = load_json(meta_path)
    print(f"\nSnapshot: {snapshot_dir}")
    print(f"Title: {meta.get('title')}")
    print(f"URL: {meta.get('url')}")
    print(f"Captured: {meta.get('captured_at')}")
    print(f"Tables: {meta.get('table_count')}\n")
    tables_dir = snapshot_dir / "tables"
    for t in meta.get("tables", []):
        idx = int(t["index"])
        cap = t.get("caption") or "(no caption)"
        tjson = tables_dir / f"table_{idx:02d}.json"
        if not tjson.exists():
            # meta.json references a table whose file was never written
            print(f"[{idx:02d}] {cap} (missing file)")
            continue
        data = load_json(tjson)
        rows = data.get("rows", [])
        r = len(rows)
        c = max((len(row) for row in rows), default=0)
        # quick preview of row 0 (often header)
        preview = ""
        if rows:
            preview = " | ".join(rows[0][:6])
        if len(preview) > 100:
            # FIX: the truncation marker was an empty string (a no-op
            # concatenation), so truncated previews were indistinguishable
            # from complete ones; append an ellipsis like the row-count
            # message below does.
            preview = preview[:100] + "…"
        print(f"[{idx:02d}] {cap} ({r}x{c})")
        if preview:
            print(f" {preview}")
def print_table(snapshot_dir: Path, idx: int, limit_rows: int = 60, limit_cols: int = 14) -> List[List[str]]:
    """Pretty-print table *idx* as an aligned text grid and return its rows.

    Args:
        snapshot_dir: snapshot folder containing tables/table_NN.json.
        idx: zero-padded table index to load.
        limit_rows: print at most this many rows (a trailing count notes the rest).
        limit_cols: print at most this many columns.

    Returns:
        The full (untruncated) row data, suitable for CSV export.

    Raises:
        SystemExit: if the table file does not exist.
    """
    tjson = snapshot_dir / "tables" / f"table_{idx:02d}.json"
    if not tjson.exists():
        raise SystemExit(f"Table not found: {tjson}")
    data = load_json(tjson)
    caption = data.get("caption") or "(no caption)"
    rows: List[List[str]] = data.get("rows", [])
    print(f"\nTable [{idx:02d}] — {caption}\n")
    if not rows:
        print("(empty)")
        return rows
    cols = min(max((len(r) for r in rows), default=0), limit_cols)
    # compute column widths (capped at 34, matching the 33-char cell cap below)
    widths = [0] * cols
    for r in rows[:limit_rows]:
        for j in range(cols):
            cell = r[j] if j < len(r) else ""
            widths[j] = min(34, max(widths[j], len(cell)))

    def fmt_row(r: List[str]) -> str:
        out = []
        for j in range(cols):
            cell = r[j] if j < len(r) else ""
            cell = cell.replace("\n", " ")
            if len(cell) > 33:
                # FIX: the truncation marker was an empty string, silently
                # clipping long cells; append an ellipsis so truncation shows.
                cell = cell[:32] + "…"
            out.append(cell.ljust(widths[j]))
        return " | ".join(out)

    for i, r in enumerate(rows[:limit_rows]):
        print(fmt_row(r))
        if i == 0:
            # underline the first row, which is usually the header
            print("-" * min(160, sum(widths) + 3 * (cols - 1)))
    if len(rows) > limit_rows:
        print(f"\n…({len(rows) - limit_rows} more rows)")
    return rows
def export_csv(rows: List[List[Any]], out_path: Path) -> None:
out_path.parent.mkdir(parents=True, exist_ok=True)
with out_path.open("w", newline="", encoding="utf-8") as f:
w = csv.writer(f)
for r in rows:
w.writerow(r)
def main():
    """CLI entry point: summarize a snapshot, or print/export one table."""
    parser = argparse.ArgumentParser()
    parser.add_argument("snapshot_dir", help="Path to a snapshot folder (has meta.json + tables/)")
    parser.add_argument("--table", type=int, help="Print a specific table index (e.g. 5)")
    parser.add_argument("--csv", help="Export printed table to CSV at this path")
    parser.add_argument("--rows", type=int, default=60, help="Row print limit (default 60)")
    parser.add_argument("--cols", type=int, default=14, help="Col print limit (default 14)")
    opts = parser.parse_args()

    snapshot_dir = Path(opts.snapshot_dir).expanduser().resolve()

    # No table selected: just show the per-table summary listing.
    if opts.table is None:
        list_tables(snapshot_dir)
        return

    rows = print_table(snapshot_dir, opts.table, limit_rows=opts.rows, limit_cols=opts.cols)
    if opts.csv:
        export_csv(rows, Path(opts.csv).expanduser().resolve())
        print(f"\n✅ CSV written: {opts.csv}")


if __name__ == "__main__":
    main()