Yeah baby
This commit is contained in:
127
inspect_snapshot.py
Normal file
127
inspect_snapshot.py
Normal file
@@ -0,0 +1,127 @@
|
||||
# inspect_snapshot.py
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import List, Any
|
||||
|
||||
def load_json(p: Path) -> dict:
    """Open the file at ``p`` as UTF-8 text and return its parsed JSON content."""
    with p.open("r", encoding="utf-8") as fh:
        return json.load(fh)
|
||||
|
||||
def find_table_files(snapshot_dir: Path) -> List[Path]:
    """Return the ``tables/table_*.json`` files of a snapshot in sorted order.

    Args:
        snapshot_dir: Snapshot folder expected to contain a ``tables/`` directory.

    Returns:
        Sorted paths matching ``table_*.json`` directly inside ``tables/``;
        an empty list when the directory holds no such files.

    Raises:
        SystemExit: If ``tables/`` is missing or is not a directory.
    """
    tables_dir = snapshot_dir / "tables"
    # is_dir() rather than exists(): a stray regular file named "tables" would
    # pass an existence check and then glob to nothing, hiding the real problem.
    if not tables_dir.is_dir():
        raise SystemExit(f"Missing tables/ folder in: {snapshot_dir}")
    return sorted(tables_dir.glob("table_*.json"))
|
||||
|
||||
def list_tables(snapshot_dir: Path) -> None:
    """Print a snapshot's metadata followed by a one-line summary per table.

    Reads ``meta.json`` from ``snapshot_dir`` and, for every entry of its
    ``tables`` list, loads ``tables/table_NN.json`` to report the row count,
    the longest row length and a short preview of the first row.

    Args:
        snapshot_dir: Snapshot folder containing ``meta.json`` and ``tables/``.

    Raises:
        SystemExit: If ``meta.json`` does not exist in ``snapshot_dir``.
    """
    meta_path = snapshot_dir / "meta.json"
    if not meta_path.exists():
        raise SystemExit(f"Missing meta.json in: {snapshot_dir}")

    meta = load_json(meta_path)
    print(f"\nSnapshot: {snapshot_dir}")
    print(f"Title: {meta.get('title')}")
    print(f"URL: {meta.get('url')}")
    print(f"Captured: {meta.get('captured_at')}")
    print(f"Tables: {meta.get('table_count')}\n")

    tables_dir = snapshot_dir / "tables"
    for t in meta.get("tables", []):
        idx = int(t["index"])
        cap = t.get("caption") or "(no caption)"
        tjson = tables_dir / f"table_{idx:02d}.json"

        # A table listed in meta.json but absent on disk is reported, not fatal.
        if not tjson.exists():
            print(f"[{idx:02d}] {cap} (missing file)")
            continue

        rows = load_json(tjson).get("rows", [])
        n_rows = len(rows)
        n_cols = max((len(row) for row in rows), default=0)

        # Quick preview of row 0 (often the header), limited to six cells.
        # Cells are coerced to text (null -> "") and newlines flattened so a
        # numeric/null cell cannot crash the join and the preview stays on
        # a single line.
        preview = ""
        if rows:
            preview = " | ".join(
                ("" if cell is None else str(cell)).replace("\n", " ")
                for cell in rows[0][:6]
            )
            if len(preview) > 100:
                preview = preview[:100] + "…"

        print(f"[{idx:02d}] {cap} ({n_rows}x{n_cols})")
        if preview:
            print(f" {preview}")
|
||||
|
||||
def print_table(snapshot_dir: Path, idx: int, limit_rows: int = 60, limit_cols: int = 14) -> List[List[str]]:
    """Pretty-print one table of a snapshot and return all of its rows.

    The table is read from ``tables/table_NN.json`` under ``snapshot_dir``.
    At most ``limit_rows`` rows and ``limit_cols`` columns are displayed, with
    a dashed separator after the first displayed row; the returned rows are
    the complete, unmodified list from the file.

    Args:
        snapshot_dir: Snapshot folder containing ``tables/``.
        idx: Table index used to build the ``table_NN.json`` file name.
        limit_rows: Maximum number of rows to display (negative counts as 0).
        limit_cols: Maximum number of columns to display (negative counts as 0).

    Returns:
        Every row stored under the file's ``rows`` key (empty list if absent).

    Raises:
        SystemExit: If the table file does not exist.
    """
    tjson = snapshot_dir / "tables" / f"table_{idx:02d}.json"
    if not tjson.exists():
        raise SystemExit(f"Table not found: {tjson}")

    data = load_json(tjson)
    caption = data.get("caption") or "(no caption)"
    rows: List[List[str]] = data.get("rows", [])

    print(f"\nTable [{idx:02d}] — {caption}\n")

    if not rows:
        print("(empty)")
        return rows

    # Single source of truth for the widest rendered cell, so the column
    # width cap and the truncation threshold cannot drift apart.
    max_cell = 34

    # Clamp the limits: a negative row limit would otherwise slice from the
    # end of the list and make the "more rows" count wrong.
    shown = rows[:max(limit_rows, 0)]
    cols = max(0, min(max((len(r) for r in rows), default=0), limit_cols))

    def render(row: List[Any], j: int) -> str:
        """Return the display text for column ``j`` of ``row``."""
        # Short rows are padded with blanks; JSON null renders as blank too.
        if j >= len(row) or row[j] is None:
            return ""
        # str() so numeric/boolean cells do not break len()/replace().
        text = str(row[j]).replace("\n", " ")
        if len(text) > max_cell:
            text = text[: max_cell - 1] + "…"
        return text

    rendered = [[render(r, j) for j in range(cols)] for r in shown]

    # Column width = widest rendered cell among the displayed rows.
    widths = [max((len(line[j]) for line in rendered), default=0) for j in range(cols)]

    for i, line in enumerate(rendered):
        print(" | ".join(text.ljust(width) for text, width in zip(line, widths)))
        if i == 0:
            # Separator under the first row, as wide as the table up to 160 chars.
            print("-" * min(160, sum(widths) + 3 * (cols - 1)))

    hidden = len(rows) - len(shown)
    if hidden > 0:
        print(f"\n…({hidden} more rows)")

    return rows
|
||||
|
||||
def export_csv(rows: List[List[Any]], out_path: Path) -> None:
    """Write ``rows`` to ``out_path`` as UTF-8 CSV, creating parent folders as needed."""
    target_dir = out_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    # newline="" leaves line endings to the csv module.
    with out_path.open("w", newline="", encoding="utf-8") as handle:
        csv.writer(handle).writerows(rows)
|
||||
|
||||
def main() -> None:
    """Command-line entry point: list a snapshot's tables, or print/export one.

    Without ``--table`` the snapshot's tables are listed. With ``--table N``
    that table is printed (bounded by ``--rows`` / ``--cols``) and, when
    ``--csv PATH`` is also given, the rows returned for it are written to
    ``PATH``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("snapshot_dir", help="Path to a snapshot folder (has meta.json + tables/)")
    ap.add_argument("--table", type=int, help="Print a specific table index (e.g. 5)")
    ap.add_argument("--csv", help="Export printed table to CSV at this path")
    ap.add_argument("--rows", type=int, default=60, help="Row print limit (default 60)")
    ap.add_argument("--cols", type=int, default=14, help="Col print limit (default 14)")
    args = ap.parse_args()

    # --csv exports the table chosen with --table; on its own it used to be
    # silently ignored, so reject that combination with a usage error.
    if args.csv and args.table is None:
        ap.error("--csv requires --table")

    snapshot_dir = Path(args.snapshot_dir).expanduser().resolve()

    if args.table is None:
        list_tables(snapshot_dir)
        return

    rows = print_table(snapshot_dir, args.table, limit_rows=args.rows, limit_cols=args.cols)
    if args.csv:
        csv_path = Path(args.csv).expanduser().resolve()
        export_csv(rows, csv_path)
        # Report the resolved path, i.e. where the file was actually written.
        print(f"\n✅ CSV written: {csv_path}")
|
||||
|
||||
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user