Ingest Ethereal 10K worker profiles — domain data in the substrate

10,000 staffing worker profiles from profit/ethereal repo. Flattened
JSON → CSV → Parquet. Indexed on HNSW (9.5s) + Lance IVF_PQ (7.2s).

SQL hybrid verified: forklift operators in IL with reliability > 0.8
returned exact matches. Vector search alone missed the state filter —
confirming the need for hybrid SQL+vector routing identified in the quality eval.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
root 2026-04-16 22:26:19 -05:00
parent f9f92706f3
commit a710896db2
4 changed files with 961 additions and 0 deletions

View File

@ -0,0 +1,159 @@
{
"id": "0efdef2d-52bc-4154-82d4-056bb3a1915d",
"name": "ethereal_workers",
"schema_fingerprint": "9a1286ffada5390b459b217f56263e5ebffec1520ea29bab0be2efc6d5381adc",
"objects": [
{
"bucket": "primary",
"key": "datasets/ethereal_workers.parquet",
"size_bytes": 6991716,
"created_at": "2026-04-17T03:23:29.399966984Z"
}
],
"created_at": "2026-04-17T03:23:29.399967772Z",
"updated_at": "2026-04-17T03:23:29.400227081Z",
"description": "",
"owner": "",
"sensitivity": "pii",
"columns": [
{
"name": "worker_id",
"data_type": "Int64",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "name",
"data_type": "Utf8",
"sensitivity": "pii",
"description": "",
"is_pii": true
},
{
"name": "role",
"data_type": "Utf8",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "email",
"data_type": "Utf8",
"sensitivity": "pii",
"description": "",
"is_pii": true
},
{
"name": "phone",
"data_type": "Int64",
"sensitivity": "pii",
"description": "",
"is_pii": true
},
{
"name": "city",
"data_type": "Utf8",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "state",
"data_type": "Utf8",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "zip",
"data_type": "Int64",
"sensitivity": "pii",
"description": "",
"is_pii": true
},
{
"name": "skills",
"data_type": "Utf8",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "certifications",
"data_type": "Utf8",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "archetype",
"data_type": "Utf8",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "reliability",
"data_type": "Float64",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "responsiveness",
"data_type": "Float64",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "engagement",
"data_type": "Float64",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "compliance",
"data_type": "Float64",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "availability",
"data_type": "Float64",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "communications",
"data_type": "Utf8",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "resume_text",
"data_type": "Utf8",
"sensitivity": null,
"description": "",
"is_pii": false
}
],
"lineage": {
"source_system": "csv",
"source_file": "ethereal_workers.csv",
"ingest_job": "ingest-1776396209399",
"ingest_timestamp": "2026-04-17T03:23:29.399966984Z",
"parent_datasets": []
},
"freshness": null,
"tags": [],
"row_count": 10000,
"last_embedded_at": null,
"embedding_stale_since": null,
"embedding_refresh_policy": null
}

Binary file not shown.

433
scripts/serve_imagegen.py Normal file
View File

@ -0,0 +1,433 @@
#!/usr/bin/env python3
"""Image generation service — proxies to ComfyUI API on :8188.

Serves on :3600. Submits the workflow to ComfyUI, polls for completion, and
returns the image. Falls back to direct diffusers if ComfyUI is unavailable.

Features:
- Disk cache — the same prompt returns the cached image instantly
- Negative prompt for quality (no faces, hands, text)
- DreamShaper XL Turbo for high-quality editorial illustrations
"""
import base64
import hashlib
import io
import json
import os
import random
import time
import urllib.request
import urllib.error
from http.server import HTTPServer, BaseHTTPRequestHandler
from pathlib import Path

# Service configuration — each value overridable via environment variable.
PORT = int(os.environ.get("IMAGEGEN_PORT", "3600"))  # port this service listens on
COMFYUI_URL = os.environ.get("COMFYUI_URL", "http://localhost:8188")  # ComfyUI backend base URL
CACHE_DIR = Path(os.environ.get("IMAGEGEN_CACHE", "./data/_imagecache"))  # on-disk webp cache
CACHE_DIR.mkdir(parents=True, exist_ok=True)  # ensure the cache directory exists at import time
# ComfyUI workflow template used for every /generate request.
WORKFLOW_PATH = "/opt/ComfyUI/workflows/editorial_hero.json"
def _cache_key(prompt, width, height, steps):
return hashlib.sha256(f"{prompt}|{width}|{height}|{steps}".encode()).hexdigest()[:24]
def _cache_get(key):
    """Return the cached image for *key* as a base64 string, or None on a miss."""
    path = CACHE_DIR / f"{key}.webp"
    if not path.exists():
        return None
    return base64.b64encode(path.read_bytes()).decode()
def _cache_put(key, img_bytes):
    """Persist *img_bytes* in the on-disk cache under *key*."""
    target = CACHE_DIR / f"{key}.webp"
    target.write_bytes(img_bytes)
def _comfyui_generate(prompt, width=1024, height=512, steps=8, seed=None):
    """Submit a workflow to ComfyUI and wait for the rendered image.

    Args:
        prompt: positive text prompt injected into the workflow's prompt node.
        width, height: output dimensions written into the latent-size node.
        steps: sampler step count.
        seed: RNG seed; a random 32-bit seed is drawn when None.

    Returns:
        ``(webp_bytes, seed)`` on success; ``(None, seed)`` when the job
        completed without images or polling timed out.
    """
    # Load the workflow template fresh each call so per-request mutations
    # below never leak between requests.
    with open(WORKFLOW_PATH) as f:
        workflow = json.load(f)
    # Customize the known node ids (3 = sampler, 5 = latent size, 6 = prompt).
    if seed is None:
        # BUGFIX: random.randint is inclusive on both ends, so the old upper
        # bound 2**32 could yield a value outside the 32-bit seed range.
        seed = random.randint(0, 2**32 - 1)
    workflow["3"]["inputs"]["seed"] = seed
    workflow["3"]["inputs"]["steps"] = steps
    workflow["5"]["inputs"]["width"] = width
    workflow["5"]["inputs"]["height"] = height
    workflow["6"]["inputs"]["text"] = prompt
    # Submit to ComfyUI; it replies immediately with a queued prompt_id.
    payload = json.dumps({"prompt": workflow}).encode()
    req = urllib.request.Request(
        f"{COMFYUI_URL}/prompt",
        data=payload,
        headers={"Content-Type": "application/json"}
    )
    with urllib.request.urlopen(req, timeout=10) as resp:  # close the response promptly
        result = json.loads(resp.read())
    prompt_id = result["prompt_id"]
    # Poll the history endpoint until the job appears (120 polls, 0.5s apart).
    for _ in range(120):
        time.sleep(0.5)
        try:
            status_req = urllib.request.Request(f"{COMFYUI_URL}/history/{prompt_id}")
            with urllib.request.urlopen(status_req, timeout=5) as status_resp:
                history = json.loads(status_resp.read())
            if prompt_id not in history:
                continue  # still queued/running — keep polling
            outputs = history[prompt_id].get("outputs", {})
            # Find the first node (normally SaveImage) that produced an image.
            for node_id, node_out in outputs.items():
                images = node_out.get("images", [])
                if images:
                    img_info = images[0]
                    # Fetch the rendered image from ComfyUI's /view endpoint.
                    img_url = f"{COMFYUI_URL}/view?filename={img_info['filename']}&subfolder={img_info.get('subfolder', '')}&type={img_info.get('type', 'output')}"
                    with urllib.request.urlopen(img_url, timeout=10) as img_resp:
                        img_data = img_resp.read()
                    # Re-encode as webp for compact transfer and caching.
                    from PIL import Image  # local import: PIL only needed on success
                    img = Image.open(io.BytesIO(img_data))
                    buf = io.BytesIO()
                    img.save(buf, format="WEBP", quality=90)
                    return buf.getvalue(), seed
            return None, seed  # job completed but produced no images
        except Exception:
            # Best-effort polling: transient fetch/parse errors retry until
            # the deadline rather than aborting the whole generation.
            continue
    return None, seed  # timed out waiting for ComfyUI
def _diffusers_fallback(prompt, width, height, steps, seed):
    """Fallback: run SDXL Turbo directly via diffusers when ComfyUI is down.

    Loads the pipeline from scratch on every call (slow but stateless),
    renders a single image, and always releases GPU memory afterwards.

    Returns:
        ``(webp_bytes, seed)``

    Raises:
        Whatever torch/diffusers raise on load or inference failure.
    """
    import torch
    from diffusers import AutoPipelineForText2Image
    pipe = AutoPipelineForText2Image.from_pretrained(
        "stabilityai/sdxl-turbo", torch_dtype=torch.float16, variant="fp16"
    ).to("cuda")
    pipe.enable_attention_slicing()  # trade a little speed for lower peak VRAM
    if seed is None:
        # BUGFIX: randint is inclusive — cap at 2**32 - 1 to stay in 32-bit range.
        seed = random.randint(0, 2**32 - 1)
    gen = torch.Generator("cuda").manual_seed(seed)
    try:
        result = pipe(prompt=prompt, num_inference_steps=steps, guidance_scale=0.0,
                      width=width, height=height, generator=gen)
        buf = io.BytesIO()
        result.images[0].save(buf, format="WEBP", quality=90)
        return buf.getvalue(), seed
    finally:
        # BUGFIX: previously skipped on exception — always free the pipeline so
        # the GPU is available again for ComfyUI/Blender.
        del pipe
        torch.cuda.empty_cache()
class ImageHandler(BaseHTTPRequestHandler):
    """JSON/CORS HTTP handler for image generation and 3D rendering.

    Endpoints:
      GET  /health       — service status, ComfyUI reachability, cache size
      GET  /cache/stats  — cached file count and total size in MB
      POST /generate     — text → image via ComfyUI, diffusers fallback
      POST /blender      — 3D hero banner via Blender Cycles
      POST /img-to-3d    — AI image → TripoSR mesh → Blender render
      POST /scene-glb    — procedural scene exported as GLB for Three.js

    All responses are JSON with permissive CORS; images travel base64-encoded
    inside the JSON body.
    """

    def log_message(self, fmt, *args): pass  # suppress default per-request stderr logging

    def _json(self, code, data):
        """Serialize *data* as a JSON response with HTTP status *code*."""
        self.send_response(code)
        self.send_header("Content-Type", "application/json")
        self.send_header("Access-Control-Allow-Origin", "*")
        self.end_headers()
        self.wfile.write(json.dumps(data).encode())

    def do_OPTIONS(self):
        # CORS preflight: allow any origin for GET/POST with a JSON body.
        self.send_response(200)
        self.send_header("Access-Control-Allow-Origin", "*")
        self.send_header("Access-Control-Allow-Methods", "POST, GET, OPTIONS")
        self.send_header("Access-Control-Allow-Headers", "Content-Type")
        self.end_headers()

    def do_GET(self):
        if self.path == "/health":
            # Probe ComfyUI so callers can tell which backend a /generate would use.
            comfy_ok = False
            try:
                r = urllib.request.urlopen(f"{COMFYUI_URL}/system_stats", timeout=3)
                comfy_ok = r.status == 200
            except: pass  # NOTE(review): bare except — any failure just reports comfyui=False
            cached = len(list(CACHE_DIR.glob("*.webp")))
            self._json(200, {"status": "ok", "comfyui": comfy_ok, "cached_images": cached})
        elif self.path == "/cache/stats":
            files = list(CACHE_DIR.glob("*.webp"))
            self._json(200, {"count": len(files), "total_mb": round(sum(f.stat().st_size for f in files)/1024**2, 1)})
        else:
            self._json(404, {"error": "not found"})

    def do_POST(self):
        # Dispatch POSTs to the matching pipeline handler.
        if self.path == "/generate":
            self._generate()
        elif self.path == "/blender":
            self._blender_render()
        elif self.path == "/img-to-3d":
            self._img_to_3d()
        elif self.path == "/scene-glb":
            self._scene_glb()
        else:
            self._json(404, {"error": "not found"})

    def _generate(self):
        """Text-to-image: disk cache → ComfyUI → diffusers fallback."""
        try:
            length = int(self.headers.get("Content-Length", 0))
            body = json.loads(self.rfile.read(length)) if length else {}
        except:
            self._json(400, {"error": "invalid JSON"}); return
        prompt = body.get("prompt", "").strip()
        if not prompt:
            self._json(400, {"error": "prompt required"}); return
        # Clamp dimensions/steps to sane bounds (256..1920 x 256..1080, 1..80 steps).
        width = min(max(int(body.get("width", 1280)), 256), 1920)
        height = min(max(int(body.get("height", 720)), 256), 1080)
        steps = min(max(int(body.get("steps", 50)), 1), 80)
        seed = body.get("seed")
        # Cache check — note the key ignores seed, so any seed hits the same entry.
        key = _cache_key(prompt, width, height, steps)
        cached = _cache_get(key)
        if cached:
            self._json(200, {"image": cached, "format": "webp", "width": width, "height": height,
                             "cached": True, "prompt": prompt[:200]}); return
        t0 = time.time()
        img_bytes = None
        # Try ComfyUI first; `backend` is assigned only on the path that produced the image.
        try:
            comfy_check = urllib.request.urlopen(f"{COMFYUI_URL}/system_stats", timeout=3)
            if comfy_check.status == 200:
                img_bytes, seed = _comfyui_generate(prompt, width, height, steps, seed)
                backend = "comfyui"
        except:
            pass  # NOTE(review): bare except — ComfyUI errors silently fall through to diffusers
        # Fallback to diffusers when ComfyUI was down or returned nothing.
        if not img_bytes:
            try:
                img_bytes, seed = _diffusers_fallback(prompt, width, height, steps, seed)
                backend = "diffusers"
            except Exception as e:
                self._json(500, {"error": str(e)[:300]}); return
        if not img_bytes:
            self._json(500, {"error": "generation failed"}); return
        elapsed_ms = int((time.time() - t0) * 1000)
        img_b64 = base64.b64encode(img_bytes).decode()
        _cache_put(key, img_bytes)
        self._json(200, {
            "image": img_b64, "format": "webp", "width": width, "height": height,
            "steps": steps, "seed": seed, "time_ms": elapsed_ms,
            "backend": backend, "prompt": prompt[:200], "cached": False,
        })
        print(f"[IMAGEGEN] {backend} {width}x{height} in {elapsed_ms}ms")

    def _blender_render(self):
        """Render a 3D hero banner via Blender Cycles GPU."""
        try:
            length = int(self.headers.get("Content-Length", 0))
            body = json.loads(self.rfile.read(length)) if length else {}
        except:
            self._json(400, {"error": "invalid JSON"}); return
        seed = body.get("seed", random.randint(0, 99999))
        # Cache check — keyed purely on seed; the scene script is deterministic per seed.
        key = f"blender-{seed}"
        cached = _cache_get(key)
        if cached:
            self._json(200, {"image": cached, "format": "webp", "backend": "blender-cached",
                             "cached": True, "seed": seed}); return
        t0 = time.time()
        output_png = f"/tmp/blender_render_{seed}.png"
        script = "/opt/ComfyUI/blender_scripts/hero_cycles.py"
        try:
            import subprocess
            # Headless Blender run; the script receives seed and output path after "--".
            result = subprocess.run(
                ["blender", "--background", "--python", script, "--", str(seed), output_png],
                capture_output=True, text=True, timeout=300
            )
            if not os.path.exists(output_png):
                # NOTE(review): ternary binds looser than `+` — when stderr is empty the
                # message is just "no output" without the "Blender render failed: " prefix.
                self._json(500, {"error": "Blender render failed: " + result.stderr[-300:] if result.stderr else "no output"}); return
            # Convert the PNG to webp for transfer/caching.
            from PIL import Image
            img = Image.open(output_png)
            buf = io.BytesIO()
            img.save(buf, format="WEBP", quality=92)
            img_bytes = buf.getvalue()
            os.remove(output_png)
            elapsed_ms = int((time.time() - t0) * 1000)
            img_b64 = base64.b64encode(img_bytes).decode()
            _cache_put(key, img_bytes)
            self._json(200, {
                "image": img_b64, "format": "webp", "width": 1280, "height": 320,
                "seed": seed, "time_ms": elapsed_ms, "backend": "blender-cycles",
                "cached": False,
            })
            print(f"[BLENDER] Rendered seed={seed} in {elapsed_ms}ms")
        except Exception as e:
            self._json(500, {"error": str(e)[:300]})

    def _img_to_3d(self):
        """Full pipeline: AI image → 3D (TripoSR or displacement) → Blender render."""
        try:
            length = int(self.headers.get("Content-Length", 0))
            body = json.loads(self.rfile.read(length)) if length else {}
        except:
            self._json(400, {"error": "invalid JSON"}); return
        prompt = body.get("prompt", "abstract flowing golden energy, fractal patterns, dark background, sharp detail").strip()
        seed = body.get("seed", random.randint(0, 99999))
        # Cache key uses prompt hash + seed mod 4: only 4 variants cached per prompt.
        key = f"img3d-{hashlib.sha256(prompt.encode()).hexdigest()[:12]}-{seed % 4}"
        cached = _cache_get(key)
        if cached:
            self._json(200, {"image": cached, "format": "webp", "backend": "img3d-cached", "cached": True}); return
        t0 = time.time()
        try:
            import subprocess
            # Step 1: Generate the source image via ComfyUI directly.
            src_path = f"/tmp/img3d_src_{seed}.png"
            try:
                img_bytes_src, _ = _comfyui_generate(prompt, 512, 512, 25, seed)
                if not img_bytes_src:
                    # Fallback to diffusers (fewer steps — SDXL Turbo needs only a few).
                    img_bytes_src, _ = _diffusers_fallback(prompt, 512, 512, 8, seed)
                if not img_bytes_src:
                    self._json(500, {"error": "Failed to generate source image"}); return
                from PIL import Image
                img_src = Image.open(io.BytesIO(img_bytes_src))
                img_src.save(src_path, "PNG")
            except Exception as e:
                self._json(500, {"error": f"Source image failed: {e}"}); return
            # Step 2: TripoSR — convert the image to a 3D mesh.
            mesh_path = f"/tmp/triposr_mesh_{seed}.obj"
            out_path = f"/tmp/img3d_out_{seed}.png"
            try:
                # Free VRAM for TripoSR. NOTE(review): stopping/starting a systemd unit
                # requires root; failures are silently ignored by capture_output.
                subprocess.run(["systemctl", "stop", "comfyui"], capture_output=True, timeout=10)
                time.sleep(3)
                # Embedded script runs in a fresh python3 so TripoSR's VRAM is fully
                # reclaimed on process exit; paths are interpolated via the f-string.
                triposr_script = f"""
import torch, sys
sys.path.insert(0, '/opt/TripoSR')
from PIL import Image
from tsr.system import TSR
model = TSR.from_pretrained('stabilityai/TripoSR', config_name='config.yaml', weight_name='model.ckpt')
model.to('cuda')
image = Image.open('{src_path}').convert('RGB')
with torch.no_grad():
    scene_codes = model([image], device='cuda')
    meshes = model.extract_mesh(scene_codes, has_vertex_color=True, resolution=128)
meshes[0].export('{mesh_path}')
del model; torch.cuda.empty_cache()
print('[TRIPOSR] mesh exported')
"""
                result = subprocess.run(
                    ["python3", "-c", triposr_script],
                    capture_output=True, text=True, timeout=120)
                if not os.path.exists(mesh_path):
                    # Fallback to the displacement approach when TripoSR produced no mesh.
                    print(f"[IMG2-3D] TripoSR failed, falling back to displacement: {result.stderr[-200:]}")
                    script = "/opt/ComfyUI/blender_scripts/image_to_3d.py"
                    result = subprocess.run(
                        ["blender", "--background", "--python", script, "--", src_path, out_path, str(seed)],
                        capture_output=True, text=True, timeout=120)
                else:
                    # Step 3: Render the TripoSR mesh in Blender with gold materials.
                    script = "/opt/ComfyUI/blender_scripts/triposr_render.py"
                    result = subprocess.run(
                        ["blender", "--background", "--python", script, "--", mesh_path, out_path, str(seed)],
                        capture_output=True, text=True, timeout=120)
                    try: os.remove(mesh_path)
                    except: pass
            finally:
                # Always restart ComfyUI, even if rendering failed above.
                subprocess.run(["systemctl", "start", "comfyui"], capture_output=True, timeout=10)
            if not os.path.exists(out_path):
                self._json(500, {"error": "Blender 3D render failed"}); return
            from PIL import Image
            img = Image.open(out_path)
            buf = io.BytesIO()
            img.save(buf, format="WEBP", quality=92)
            img_bytes = buf.getvalue()
            # Cleanup temp files (best-effort).
            for f in [src_path, out_path]:
                try: os.remove(f)
                except: pass
            elapsed = int((time.time() - t0) * 1000)
            img_b64 = base64.b64encode(img_bytes).decode()
            _cache_put(key, img_bytes)
            self._json(200, {
                "image": img_b64, "format": "webp", "width": 1280, "height": 320,
                "seed": seed, "time_ms": elapsed, "backend": "img-to-3d", "cached": False,
            })
            print(f"[IMG2-3D] seed={seed} prompt={prompt[:50]} in {elapsed}ms")
        except Exception as e:
            self._json(500, {"error": str(e)[:300]})

    def _scene_glb(self):
        """Generate a 3D scene and export it as GLB for the Three.js viewer."""
        try:
            length = int(self.headers.get("Content-Length", 0))
            body = json.loads(self.rfile.read(length)) if length else {}
        except:
            self._json(400, {"error": "invalid JSON"}); return
        seed = body.get("seed", random.randint(0, 99999))
        key = f"glb-{seed}"
        # Check cache — GLB entries live alongside the webp cache, keyed by seed.
        glb_cache = CACHE_DIR / f"{key}.glb"
        if glb_cache.exists():
            glb_b64 = base64.b64encode(glb_cache.read_bytes()).decode()
            self._json(200, {"glb": glb_b64, "seed": seed, "cached": True})
            return
        t0 = time.time()
        glb_path = f"/tmp/scene_{seed}.glb"
        try:
            import subprocess
            result = subprocess.run(
                ["blender", "--background", "--python", "/opt/ComfyUI/blender_scripts/export_glb.py",
                 "--", str(seed), glb_path],
                capture_output=True, text=True, timeout=120)
            if not os.path.exists(glb_path):
                # NOTE(review): same ternary-precedence issue as _blender_render —
                # empty stderr yields just "no output" without the prefix.
                self._json(500, {"error": "GLB export failed: " + result.stderr[-200:] if result.stderr else "no output"})
                return
            # NOTE(review): file handle is never closed explicitly; relies on GC.
            glb_bytes = open(glb_path, 'rb').read()
            os.remove(glb_path)
            glb_cache.write_bytes(glb_bytes)
            glb_b64 = base64.b64encode(glb_bytes).decode()
            elapsed = int((time.time() - t0) * 1000)
            self._json(200, {"glb": glb_b64, "seed": seed, "time_ms": elapsed, "cached": False})
            print(f"[GLB] seed={seed} in {elapsed}ms size={len(glb_bytes)//1024}KB")
        except Exception as e:
            self._json(500, {"error": str(e)[:300]})
if __name__ == "__main__":
    # Announce the effective configuration, then block serving requests forever.
    for line in (
        f"[IMAGEGEN] Starting on port {PORT}",
        f"[IMAGEGEN] ComfyUI backend: {COMFYUI_URL}",
        f"[IMAGEGEN] Cache: {CACHE_DIR}",
    ):
        print(line)
    server = HTTPServer(("0.0.0.0", PORT), ImageHandler)
    server.serve_forever()

369
scripts/serve_lab.py Normal file
View File

@ -0,0 +1,369 @@
#!/usr/bin/env python3
"""Pipeline Lab server — serves notebook UI on :3500, proxies API to sidecar :3200."""
import http.server
import json
import urllib.request
import urllib.error

PORT = 3500  # port the notebook UI is served on
SIDECAR = "http://localhost:3200"  # backend sidecar that handles all /lab/* API calls
class LabHandler(http.server.BaseHTTPRequestHandler):
    """Serves the notebook UI at "/" and transparently proxies /lab/* to the sidecar."""

    def log_message(self, fmt, *args):
        pass  # quiet — suppress default per-request stderr logging

    def do_GET(self):
        # Root serves the embedded single-page UI; everything under /lab/ is proxied.
        if self.path == "/" or self.path == "":
            self._serve_ui()
        elif self.path.startswith("/lab/"):
            self._proxy("GET")
        else:
            self.send_error(404)

    def do_POST(self):
        if self.path.startswith("/lab/"):
            self._proxy("POST")
        else:
            self.send_error(404)

    def do_DELETE(self):
        if self.path.startswith("/lab/"):
            self._proxy("DELETE")
        else:
            self.send_error(404)

    def _proxy(self, method):
        """Proxy request to sidecar.

        Forwards the body (POST/PUT only) and Content-Type, relays the
        sidecar's status and body back verbatim, and maps transport-level
        failures (connection refused, timeout) to a 502 JSON error.
        """
        url = SIDECAR + self.path
        body = None
        if method in ("POST", "PUT"):
            length = int(self.headers.get("Content-Length", 0))
            body = self.rfile.read(length) if length else None
        req = urllib.request.Request(url, data=body, method=method)
        req.add_header("Content-Type", self.headers.get("Content-Type", "application/json"))
        try:
            with urllib.request.urlopen(req, timeout=120) as resp:
                data = resp.read()
                self.send_response(resp.status)
                self.send_header("Content-Type", resp.headers.get("Content-Type", "application/json"))
                self.send_header("Access-Control-Allow-Origin", "*")
                self.end_headers()
                self.wfile.write(data)
        except urllib.error.HTTPError as e:
            # Sidecar answered with an HTTP error — pass its status and body through.
            self.send_response(e.code)
            self.send_header("Content-Type", "application/json")
            self.end_headers()
            self.wfile.write(e.read())
        except Exception as e:
            # Sidecar unreachable or other transport failure.
            self.send_response(502)
            self.send_header("Content-Type", "application/json")
            self.end_headers()
            self.wfile.write(json.dumps({"error": str(e)}).encode())

    def _serve_ui(self):
        """Send the embedded HTML notebook page (module-level HTML constant)."""
        self.send_response(200)
        self.send_header("Content-Type", "text/html; charset=utf-8")
        self.end_headers()
        self.wfile.write(HTML.encode())
HTML = r"""<!DOCTYPE html>
<html lang="en"><head>
<meta charset="UTF-8"><meta name="viewport" content="width=device-width,initial-scale=1.0">
<title>Pipeline Lab &#x2014; Lakehouse</title>
<style>
:root{--bg:#08090c;--surface:rgba(14,16,22,0.9);--border:#2a2d35;--text:#e8e6e3;--text2:#7a7872;--accent:#4ade80;--gold:#e2b55a;--red:#e05252;--blue:#5b9cf5;--purple:#c084fc}
*{box-sizing:border-box;margin:0;padding:0}
body{font-family:'SF Mono','Menlo','Consolas',monospace;background:var(--bg);color:var(--text);min-height:100vh;padding:20px 28px;font-size:13px}
h1{font-size:18px;font-weight:700;margin-bottom:4px}h1 span{color:var(--accent)}
.subtitle{color:var(--text2);font-size:11px;margin-bottom:20px}
.nav{display:flex;gap:8px;margin-bottom:16px;font-size:10px}
.nav a{color:var(--text2);text-decoration:none;padding:4px 10px;border:1px solid var(--border);border-radius:3px}
.nav a:hover{border-color:var(--accent);color:var(--accent)}
.cells{display:flex;flex-direction:column;gap:12px;max-width:1100px}
.cell{background:var(--surface);border:1px solid var(--border);border-radius:4px;overflow:hidden}
.cell.running{border-color:var(--gold);animation:pulse 1.5s infinite}
@keyframes pulse{0%,100%{border-color:var(--gold)}50%{border-color:var(--border)}}
.cell-header{display:flex;align-items:center;gap:8px;padding:8px 12px;border-bottom:1px solid var(--border);font-size:10px;text-transform:uppercase;letter-spacing:1px;color:var(--text2)}
.cell-type{font-weight:700}
.cell-time{margin-left:auto}
.cell-input{padding:12px;background:rgba(0,0,0,0.3)}
.cell-input textarea{width:100%;min-height:60px;background:transparent;border:none;color:var(--text);font-family:inherit;font-size:13px;resize:vertical;outline:none;line-height:1.6}
.cell-output{padding:12px;font-size:12px;line-height:1.6;white-space:pre-wrap;max-height:500px;overflow-y:auto;display:none}
.cell-output.has-data{display:block;border-top:1px solid var(--border)}
.toolbar{display:flex;gap:6px;padding:8px 12px;border-top:1px solid var(--border);flex-wrap:wrap}
.btn{font-family:inherit;font-size:10px;text-transform:uppercase;letter-spacing:0.5px;padding:5px 12px;border:1px solid var(--border);border-radius:3px;background:transparent;color:var(--text2);cursor:pointer;transition:all 0.15s}
.btn:hover{border-color:var(--accent);color:var(--accent)}
.btn.primary{border-color:var(--accent);color:var(--accent);background:rgba(74,222,128,0.06)}
.btn.gold{border-color:var(--gold);color:var(--gold)}
.btn.blue{border-color:var(--blue);color:var(--blue)}
.btn.purple{border-color:var(--purple);color:var(--purple)}
.btn.red{border-color:var(--red);color:var(--red)}
.top-bar{display:flex;gap:8px;margin-bottom:16px;align-items:center;flex-wrap:wrap}
.status-bar{display:flex;gap:12px;padding:8px 12px;background:var(--surface);border:1px solid var(--border);border-radius:4px;margin-bottom:16px;font-size:10px;color:var(--text2)}
.stat{display:flex;align-items:center;gap:4px}.stat b{color:var(--text)}
.result-row{display:flex;gap:8px;padding:6px 8px;border-bottom:1px solid rgba(42,45,53,0.3);align-items:center;font-size:11px}
.result-row:last-child{border-bottom:none}
.score-bar{width:60px;height:5px;background:rgba(0,0,0,0.2);border-radius:3px;overflow:hidden}
.score-fill{height:100%;border-radius:3px}
.threshold-slider{display:flex;align-items:center;gap:8px;padding:0 12px;margin:4px 0}
.threshold-slider input[type=range]{flex:1;accent-color:var(--accent)}
.threshold-slider .val{font-weight:700;min-width:36px;text-align:right}
</style></head><body>
<h1><span>Pipeline Lab</span> // Lakehouse</h1>
<div class="subtitle">Embedding-based screening vs LLM classification &#x2014; iterative experimentation</div>
<div class="nav">
<a href="http://192.168.1.177:3100">Gateway :3100</a>
<a href="http://192.168.1.177:3300">UI :3300</a>
<a href="http://192.168.1.177:3400">Observer :3400</a>
</div>
<div class="status-bar" id="status-bar">
<div class="stat"><span>Exemplars:</span> <b id="st-exemplars">0</b></div>
<div class="stat"><span>Categories:</span> <b id="st-categories">0</b></div>
<div class="stat"><span>Pipelines:</span> <b id="st-pipelines">0</b></div>
<div class="stat" style="margin-left:auto"><span>Sidecar:</span> <b id="st-health" style="color:var(--text2)">...</b></div>
</div>
<div class="top-bar">
<button class="btn primary" onclick="addCell('exemplars')">+ Exemplars</button>
<button class="btn gold" onclick="addCell('screen')">+ Screen</button>
<button class="btn blue" onclick="addCell('classify')">+ Classify</button>
<button class="btn purple" onclick="addCell('benchmark')">+ Benchmark</button>
<button class="btn" onclick="addCell('similarity')">+ Similarity</button>
<button class="btn" onclick="addCell('generate')">+ Generate</button>
<button class="btn" onclick="addCell('pipeline')">+ Pipeline</button>
<span style="flex:1"></span>
<button class="btn red" onclick="clearCells()">Clear All</button>
</div>
<div class="cells" id="cells"></div>
<script>
var cellCounter = 0;
function esc(t){var d=document.createElement('span');d.textContent=String(t);return d.innerHTML}
async function api(path, body) {
var opts = body ? {method:'POST', headers:{'Content-Type':'application/json'}, body:JSON.stringify(body)} : {};
var r = await fetch('/lab' + path, opts);
if (!r.ok) { var e = await r.json().catch(function(){return {error:'request failed'}}); throw new Error(e.error || r.statusText); }
return r.json();
}
async function refreshStatus() {
try {
var ex = await api('/exemplars');
var pl = await api('/pipelines');
document.getElementById('st-exemplars').textContent = ex.total || 0;
document.getElementById('st-categories').textContent = Object.keys(ex.categories || {}).length;
document.getElementById('st-pipelines').textContent = (pl.pipelines || []).length;
document.getElementById('st-health').textContent = 'connected';
document.getElementById('st-health').style.color = 'var(--accent)';
} catch(e) {
document.getElementById('st-health').textContent = 'error: ' + e.message;
document.getElementById('st-health').style.color = 'var(--red)';
}
}
var placeholders = {
exemplars:'Category: decision\n---\nWe decided to use Parquet for all storage\nThe team chose React over Vue\nArchitecture decision: microservices',
screen:'We decided to migrate to PostgreSQL\nThe weather is nice today\nArchitecture: chose event sourcing over CRUD\nLunch was great',
classify:'We decided to migrate to PostgreSQL\nThe weather is nice today\nArchitecture: chose event sourcing',
benchmark:'We decided to use Kubernetes for orchestration\nThe new hire starts Monday\nTechnical debt: refactor the auth module\nLunch menu looks good today',
similarity:'We chose React for the frontend\nReact was selected as our UI framework\nThe database uses PostgreSQL',
generate:'Enter a prompt for the LLM...',
pipeline:'Pipeline name: my-extraction\n---\nscreen | threshold=0.6\nclassify\nextract | prompt=Extract the key decision and rationale\nvalidate | dedup_threshold=0.9'
};
var colors = {exemplars:'var(--accent)',screen:'var(--gold)',classify:'var(--blue)',benchmark:'var(--purple)',similarity:'var(--text2)',generate:'var(--text2)',pipeline:'var(--accent)'};
var labels = {exemplars:'EXEMPLARS',screen:'SCREEN',classify:'CLASSIFY (LLM)',benchmark:'BENCHMARK A/B',similarity:'SIMILARITY',generate:'GENERATE',pipeline:'PIPELINE'};
function addCell(type) {
var id = 'cell-' + (++cellCounter);
var cells = document.getElementById('cells');
var cell = document.createElement('div'); cell.className = 'cell'; cell.id = id;
var header = document.createElement('div'); header.className = 'cell-header';
var typeSpan = document.createElement('span'); typeSpan.className = 'cell-type'; typeSpan.style.color = colors[type]||'var(--text2)'; typeSpan.textContent = labels[type]||type; header.appendChild(typeSpan);
var numSpan = document.createElement('span'); numSpan.textContent = 'Cell #' + cellCounter; header.appendChild(numSpan);
var timeSpan = document.createElement('span'); timeSpan.className = 'cell-time'; timeSpan.id = id+'-time'; header.appendChild(timeSpan);
cell.appendChild(header);
var inputDiv = document.createElement('div'); inputDiv.className = 'cell-input';
var ta = document.createElement('textarea'); ta.id = id+'-input'; ta.placeholder = placeholders[type]||''; ta.value = placeholders[type]||'';
inputDiv.appendChild(ta); cell.appendChild(inputDiv);
if (type === 'screen' || type === 'benchmark') {
var sl = document.createElement('div'); sl.className = 'threshold-slider';
var slLbl = document.createElement('span'); slLbl.style.cssText='font-size:10px;color:var(--text2)'; slLbl.textContent='Threshold:'; sl.appendChild(slLbl);
var range = document.createElement('input'); range.type='range'; range.min='0.3'; range.max='0.95'; range.step='0.05'; range.value='0.65'; range.id=id+'-threshold';
var valSpan = document.createElement('span'); valSpan.className='val'; valSpan.textContent='0.65';
range.oninput=function(){valSpan.textContent=this.value};
sl.appendChild(range); sl.appendChild(valSpan); cell.appendChild(sl);
}
var outputDiv = document.createElement('div'); outputDiv.className='cell-output'; outputDiv.id=id+'-output'; cell.appendChild(outputDiv);
var tb = document.createElement('div'); tb.className='toolbar';
var runBtn = document.createElement('button'); runBtn.className='btn primary'; runBtn.textContent='Run'; runBtn.onclick=function(){runCell(id,type)}; tb.appendChild(runBtn);
var rmBtn = document.createElement('button'); rmBtn.className='btn red'; rmBtn.textContent='Remove'; rmBtn.onclick=function(){cell.remove()}; tb.appendChild(rmBtn);
cell.appendChild(tb);
cells.appendChild(cell); ta.focus();
}
function clearCells(){document.getElementById('cells').textContent='';cellCounter=0}
function parseLines(text){return text.split('\n').map(function(l){return l.trim()}).filter(function(l){return l&&l.charAt(0)!=='#'})}
async function runCell(id, type) {
var cell = document.getElementById(id);
var input = document.getElementById(id+'-input').value;
var output = document.getElementById(id+'-output');
var timeEl = document.getElementById(id+'-time');
cell.classList.add('running');
output.className='cell-output has-data'; output.textContent='Running...'; output.style.color='';
try {
var t0=performance.now(), result;
if (type==='exemplars') {
var parts=input.split('---'); var catLine=(parts[0]||'').trim();
var category=catLine.replace(/^category:\s*/i,'').trim().toLowerCase();
var texts=parseLines(parts.slice(1).join('\n'));
if(!category||!texts.length){output.textContent='Format: Category: name\\n---\\ntext1\\ntext2';return}
result=await api('/exemplars',{category:category,texts:texts});
output.textContent='Added '+result.added+' exemplars to "'+result.category+'" (total: '+result.total+')';
output.style.color='var(--accent)'; refreshStatus();
}
else if (type==='screen') {
var texts=parseLines(input);
var threshold=parseFloat((document.getElementById(id+'-threshold')||{}).value||'0.65');
result=await api('/screen',{texts:texts,threshold:threshold});
renderScreen(output,result,threshold);
}
else if (type==='classify') {
var texts=parseLines(input); result=await api('/classify',{texts:texts});
renderClassify(output,result);
}
else if (type==='benchmark') {
var texts=parseLines(input);
var threshold=parseFloat((document.getElementById(id+'-threshold')||{}).value||'0.65');
result=await api('/benchmark',{texts:texts,threshold:threshold});
renderBenchmark(output,result);
}
else if (type==='similarity') {
var texts=parseLines(input); result=await api('/cell',{action:'similarity',texts:texts});
renderMatrix(output,result);
}
else if (type==='generate') {
result=await api('/cell',{action:'generate',text:input});
output.textContent=result.text||'(empty)';
}
else if (type==='pipeline') {
var parts=input.split('---'); var pName=(parts[0]||'').trim().replace(/^pipeline\s*name:\s*/i,'').trim();
var stageLines=parseLines(parts.slice(1).join('\n'));
var stages=stageLines.map(function(line){
var ps=line.split('|').map(function(s){return s.trim()});var config={};
ps.slice(1).forEach(function(p){var kv=p.split('=');if(kv.length===2){var v=kv[1].trim();config[kv[0].trim()]=isNaN(parseFloat(v))?v:parseFloat(v)}});
return{name:ps[0],mode:ps[0],config:config};
});
await api('/pipelines',{name:pName,stages:stages,description:'Pipeline Lab'});
output.textContent='Pipeline "'+pName+'" saved ('+stages.length+' stages)';
output.style.color='var(--accent)'; refreshStatus();
}
timeEl.textContent=Math.round(performance.now()-t0)+'ms'+(result&&result.time_ms?' (server: '+result.time_ms+'ms)':'');
} catch(e){output.textContent='Error: '+e.message;output.style.color='var(--red)'} finally{cell.classList.remove('running')}
}
function renderScreen(el,results,threshold){
// Rebuild the screening output: one row per result showing the best
// category, similarity %, a score bar, the text, and a PASS/FILTERED badge.
el.textContent='';
function span(css,text){
var s=document.createElement('span');
s.style.cssText=css;
s.textContent=text;
return s;
}
for(var i=0;i<results.length;i++){
var r=results[i];
var row=document.createElement('div');
row.className='result-row';
// Best-matching category; dimmed when the row fell below the threshold.
row.appendChild(span('min-width:90px;font-weight:700;color:'+(r.above_threshold?'var(--accent)':'var(--text2)'),r.best_category||'none'));
// Similarity percentage, colour-graded: >=0.7 accent, >=threshold gold.
var simColor=r.similarity>=0.7?'var(--accent)':r.similarity>=threshold?'var(--gold)':'var(--text2)';
row.appendChild(span('min-width:50px;font-weight:700;color:'+simColor,(r.similarity*100).toFixed(1)+'%'));
// Horizontal fill bar mirroring the same bands (red below threshold).
var bar=document.createElement('div');
bar.className='score-bar';
var fill=document.createElement('div');
fill.className='score-fill';
fill.style.width=(r.similarity*100)+'%';
fill.style.background=r.similarity>=0.7?'var(--accent)':r.similarity>=threshold?'var(--gold)':'var(--red)';
bar.appendChild(fill);
row.appendChild(bar);
// The screened text itself, single line with ellipsis overflow.
row.appendChild(span('flex:1;overflow:hidden;text-overflow:ellipsis;white-space:nowrap',r.text));
// Outcome badge.
row.appendChild(span('font-size:9px;padding:2px 6px;border-radius:2px;border:1px solid;'+(r.above_threshold?'color:var(--accent);border-color:var(--accent)':'color:var(--text2);border-color:var(--border)'),r.above_threshold?'PASS':'FILTERED'));
el.appendChild(row);
}
}
function renderClassify(el,results){
// Render LLM classification results: category, confidence label, and text.
el.textContent='';
results.forEach(function(r){
var row=document.createElement('div');
row.className='result-row';
var cat=document.createElement('span');
cat.style.cssText='min-width:90px;font-weight:700;color:var(--blue)';
cat.textContent=r.category;
// Confidence is a string level; map each level to its display colour.
var confColor;
if(r.confidence==='high'){confColor='var(--accent)';}
else if(r.confidence==='medium'){confColor='var(--gold)';}
else{confColor='var(--text2)';}
var conf=document.createElement('span');
conf.style.cssText='min-width:50px;font-size:10px;color:'+confColor;
conf.textContent=r.confidence;
var txt=document.createElement('span');
txt.style.flex='1';
txt.textContent=r.text;
[cat,conf,txt].forEach(function(child){row.appendChild(child)});
el.appendChild(row);
});
}
// Render the embedding-vs-LLM benchmark view: a strip of summary stat boxes
// followed by side-by-side per-text result columns for each method.
// `result` is the /benchmark API response; presumably embed_results carries
// similarity scores and llm_results carries confidence labels — the two
// column branches below (col[3] flag) read different fields accordingly.
function renderBenchmark(el,result){
el.textContent='';
var summary=document.createElement('div');summary.style.cssText='display:flex;gap:12px;margin-bottom:12px;flex-wrap:wrap';
// Summary boxes as [label, display value, colour] triples; colour encodes
// quality (e.g. agreement >= 80% and speedup >= 2x shown in accent green).
[['Agreement',(result.agreement_rate*100).toFixed(1)+'%',result.agreement_rate>=0.8?'var(--accent)':'var(--gold)'],
['Speedup',result.speedup+'x',result.speedup>=2?'var(--accent)':'var(--text)'],
['Embed',result.embed_time_ms+'ms','var(--gold)'],
['LLM',result.llm_time_ms+'ms','var(--blue)'],
['Hybrid',result.hybrid_estimated_ms+'ms','var(--accent)'],
['Screened',result.texts_screened_out+'/'+result.total_texts,'var(--purple)']
].forEach(function(s){
var box=document.createElement('div');box.style.cssText='background:rgba(0,0,0,0.2);padding:8px 12px;border-radius:3px;text-align:center';
var lbl=document.createElement('div');lbl.style.cssText='font-size:9px;color:var(--text2);text-transform:uppercase;letter-spacing:0.5px';lbl.textContent=s[0];box.appendChild(lbl);
var val=document.createElement('div');val.style.cssText='font-size:18px;font-weight:700;color:'+s[2];val.textContent=s[1];box.appendChild(val);
summary.appendChild(box);
});
el.appendChild(summary);
// Side by side
// Two columns as [rows, title, colour, isEmbedding]; the boolean switches
// which fields are read per row (similarity/best_category vs
// confidence/category) and how the category colour is chosen.
var grid=document.createElement('div');grid.style.cssText='display:grid;grid-template-columns:1fr 1fr;gap:12px';
[[result.embed_results||[],'EMBEDDING ('+result.embed_time_ms+'ms)','var(--gold)',true],
[result.llm_results||[],'LLM ('+result.llm_time_ms+'ms)','var(--blue)',false]
].forEach(function(col){
var div=document.createElement('div');div.style.cssText='background:rgba(0,0,0,0.2);border-radius:3px;padding:10px';
var title=document.createElement('div');title.style.cssText='font-size:10px;text-transform:uppercase;letter-spacing:1px;margin-bottom:6px;font-weight:700;color:'+col[2];title.textContent=col[1];div.appendChild(title);
col[0].forEach(function(r){
var row=document.createElement('div');row.style.cssText='font-size:11px;padding:3px 0;display:flex;gap:6px;align-items:center';
// Embedding rows: category dimmed below threshold; LLM rows: always blue.
var c=document.createElement('span');c.style.cssText='min-width:70px;font-weight:700;color:'+(col[3]?(r.above_threshold?'var(--accent)':'var(--text2)'):'var(--blue)');c.textContent=(col[3]?r.best_category:r.category)||'none';row.appendChild(c);
// Embedding rows show similarity %, LLM rows show the confidence label.
var s=document.createElement('span');s.style.cssText='min-width:40px;color:var(--text2)';s.textContent=col[3]?(r.similarity*100).toFixed(0)+'%':r.confidence;row.appendChild(s);
var t=document.createElement('span');t.style.cssText='flex:1;overflow:hidden;text-overflow:ellipsis;white-space:nowrap';t.textContent=r.text;row.appendChild(t);
div.appendChild(row);
});
grid.appendChild(div);
});
el.appendChild(grid);
}
function renderMatrix(el,result){
// Render the pairwise similarity matrix as a colour-coded table:
// header row of truncated texts, then one row per text with one cell
// per score; the diagonal gets a faint highlight.
el.textContent='';
var matrix=result.matrix||[];
var texts=result.texts||[];
if(!matrix.length){el.textContent='No results';return}
var tbl=document.createElement('table');
tbl.style.cssText='border-collapse:collapse;font-size:11px;width:100%';
// Header: empty corner cell, then a truncated label per text
// (full text available on hover via the title attribute).
var hdr=document.createElement('tr');
hdr.appendChild(document.createElement('th'));
texts.forEach(function(t){
var th=document.createElement('th');
th.style.cssText='padding:4px;color:var(--text2);font-size:9px;max-width:100px;overflow:hidden;text-overflow:ellipsis;white-space:nowrap';
th.textContent=t.substring(0,20);
th.title=t;
hdr.appendChild(th);
});
tbl.appendChild(hdr);
matrix.forEach(function(row,i){
var tr=document.createElement('tr');
var label=document.createElement('td');
label.style.cssText='padding:4px;color:var(--text2);font-size:9px';
label.textContent=texts[i].substring(0,20);
tr.appendChild(label);
row.forEach(function(v,j){
var td=document.createElement('td');
// Bold from 0.7, accent text from 0.8, gold from 0.6;
// background tints strong matches and the self-similarity diagonal.
var weight=v>=0.7?'700':'400';
var color=v>=0.8?'var(--accent)':v>=0.6?'var(--gold)':'var(--text2)';
var bg=i===j?'rgba(74,222,128,0.1)':v>=0.8?'rgba(74,222,128,0.15)':v>=0.6?'rgba(226,181,90,0.1)':'transparent';
td.style.cssText='padding:4px;text-align:center;font-weight:'+weight+';color:'+color+';background:'+bg;
td.textContent=v.toFixed(2);
tr.appendChild(td);
});
tbl.appendChild(tr);
});
el.appendChild(tbl);
}
refreshStatus();
</script>
</body></html>"""
if __name__ == "__main__":
    # Bind first so a port-in-use error surfaces before the banner prints,
    # and use the server as a context manager so the listening socket is
    # always closed on exit.
    with http.server.HTTPServer(("0.0.0.0", PORT), LabHandler) as server:
        print(f"Pipeline Lab running on http://0.0.0.0:{PORT}")
        try:
            server.serve_forever()
        except KeyboardInterrupt:
            # Ctrl-C: exit cleanly; the with-block closes the socket.
            pass