Surfaced by today's untracked-files audit. None of these are accidental: several
are referenced by name in CLAUDE.md and memory files but were never added to the
repository.
Categories:
- docs/PHASE_AUDIT_GUIDE.md (106 LOC) — Claude Code phase audit guidance
- ops/systemd/lakehouse-langfuse-bridge.service — Langfuse bridge unit
- package.json — top-level npm manifest
- scripts/e2e_pipeline_check.sh + production_smoke.sh — real test scripts
- reports/kimi/audit-last-week*.md — the reports the "Two reports live" note in CLAUDE.md cites
- tests/multi-agent/scenarios/ — 44 staffing scenarios (cutover decision A)
- tests/multi-agent/playbooks/ — 102 playbook records
- tests/battery/, tests/agent_test/PRD.md, tests/real-world/* — real tests
- sidecar/sidecar/{lab_ui,pipeline_lab}.py — 888 LOC dev-only UIs that
remain in service post-sidecar-drop (commit ba928b1 explicitly kept them)
Sensitivity check: scenarios use synthetic company names ("Heritage Foods",
"Cornerstone Fabrication"); audit reports describe code findings only;
no PII or secrets surfaced.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

sidecar/sidecar/pipeline_lab.py (504 lines, 17 KiB, Python):

"""Pipeline Lab — iterative embedding/LLM pipeline experimentation.
|
|
|
|
Provides:
|
|
- Exemplar-based embedding classification (fast screening)
|
|
- LLM-based classification (accurate but slow)
|
|
- A/B benchmarking between the two
|
|
- Pipeline definition and execution
|
|
- Notebook-style API for interactive experimentation
|
|
"""

import json
import math
import os
import time
from pathlib import Path
from typing import Optional

from fastapi import APIRouter, HTTPException
from fastapi.responses import HTMLResponse
from pydantic import BaseModel

from .ollama import client

router = APIRouter()

EMBED_MODEL = os.environ.get("EMBED_MODEL", "nomic-embed-text")
GEN_MODEL = os.environ.get("GEN_MODEL", "qwen2.5")
LAB_DIR = Path(os.environ.get("LAB_DIR", "./data/_pipeline_lab"))
LAB_DIR.mkdir(parents=True, exist_ok=True)


# ─── Vector math ─────────────────────────────────────────────

def cosine_similarity(a: list[float], b: list[float]) -> float:
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)


# ─── Exemplar store ──────────────────────────────────────────
# Exemplars are labeled text+embedding pairs used for classification.
# e.g. category="decision" texts=["We decided to use Parquet", "The team chose React"]

_exemplars: dict[str, list[dict]] = {}  # category -> [{text, embedding}]


def _exemplar_file() -> Path:
    return LAB_DIR / "exemplars.json"


def _load_exemplars():
    global _exemplars
    fp = _exemplar_file()
    if fp.exists():
        data = json.loads(fp.read_text())
        _exemplars = data
    return _exemplars


def _save_exemplars():
    _exemplar_file().write_text(json.dumps(_exemplars, indent=2))


_load_exemplars()


# ─── Pipeline store ──────────────────────────────────────────

def _pipelines_dir() -> Path:
    d = LAB_DIR / "pipelines"
    d.mkdir(exist_ok=True)
    return d


# ─── Embedding helper ────────────────────────────────────────

async def _embed_texts(texts: list[str], model: str = EMBED_MODEL) -> list[list[float]]:
    embeddings = []
    async with client() as c:
        for text in texts:
            resp = await c.post("/api/embed", json={"model": model, "input": text})
            if resp.status_code != 200:
                raise HTTPException(502, f"Ollama embed error: {resp.text}")
            data = resp.json()
            embeddings.extend(data.get("embeddings", []))
    return embeddings


async def _generate(prompt: str, model: str = GEN_MODEL, temperature: float = 0.3) -> str:
    async with client() as c:
        resp = await c.post("/api/generate", json={
            "model": model, "prompt": prompt, "stream": False,
            "options": {"temperature": temperature, "num_predict": 1024}
        })
        if resp.status_code != 200:
            raise HTTPException(502, f"Ollama generate error: {resp.text}")
        return resp.json().get("response", "")


# ─── API: Exemplars ──────────────────────────────────────────

class ExemplarAdd(BaseModel):
    category: str
    texts: list[str]


class ExemplarList(BaseModel):
    categories: dict[str, int]  # category -> count


@router.post("/exemplars")
async def add_exemplars(req: ExemplarAdd):
    """Add labeled exemplar texts for a category. Embeddings generated automatically."""
    category = req.category.strip().lower()
    if not category or not req.texts:
        raise HTTPException(400, "category and texts required")

    embeddings = await _embed_texts(req.texts)

    if category not in _exemplars:
        _exemplars[category] = []

    for text, emb in zip(req.texts, embeddings):
        _exemplars[category].append({"text": text, "embedding": emb})

    _save_exemplars()
    return {"ok": True, "category": category, "added": len(req.texts),
            "total": len(_exemplars[category])}


@router.get("/exemplars")
async def list_exemplars():
    """List all exemplar categories and counts."""
    return {"categories": {k: len(v) for k, v in _exemplars.items()},
            "total": sum(len(v) for v in _exemplars.values())}


@router.delete("/exemplars/{category}")
async def delete_exemplar_category(category: str):
    if category in _exemplars:
        del _exemplars[category]
        _save_exemplars()
    return {"ok": True}


# ─── API: Screen (embedding-based classification) ────────────

class ScreenRequest(BaseModel):
    texts: list[str]
    threshold: float = 0.65
    top_k: int = 1


class ScreenResult(BaseModel):
    text: str
    best_category: str | None
    similarity: float
    above_threshold: bool
    all_scores: dict[str, float]


@router.post("/screen", response_model=list[ScreenResult])
async def screen_texts(req: ScreenRequest):
    """Classify texts by cosine similarity to exemplar embeddings (fast path)."""
    if not _exemplars:
        raise HTTPException(400, "No exemplars defined. Add exemplars first.")

    embeddings = await _embed_texts(req.texts)
    results = []

    for text, emb in zip(req.texts, embeddings):
        category_scores = {}
        for category, exemplar_list in _exemplars.items():
            sims = [cosine_similarity(emb, ex["embedding"]) for ex in exemplar_list]
            category_scores[category] = max(sims) if sims else 0.0

        best_cat = max(category_scores, key=category_scores.get) if category_scores else None
        best_sim = category_scores.get(best_cat, 0.0) if best_cat else 0.0

        results.append(ScreenResult(
            text=text[:200],
            best_category=best_cat if best_sim >= req.threshold else None,
            similarity=round(best_sim, 4),
            above_threshold=best_sim >= req.threshold,
            all_scores={k: round(v, 4) for k, v in sorted(category_scores.items(),
                                                          key=lambda x: x[1], reverse=True)},
        ))

    return results


# ─── API: Classify (LLM-based classification) ────────────────

class ClassifyRequest(BaseModel):
    texts: list[str]
    categories: list[str] | None = None  # if None, use exemplar category names
    model: str | None = None


class ClassifyResult(BaseModel):
    text: str
    category: str
    confidence: str
    reasoning: str


@router.post("/classify", response_model=list[ClassifyResult])
async def classify_texts(req: ClassifyRequest):
    """Classify texts using LLM (slow but accurate path)."""
    categories = req.categories or list(_exemplars.keys())
    if not categories:
        raise HTTPException(400, "No categories. Provide categories or add exemplars.")

    model = req.model or GEN_MODEL
    results = []

    for text in req.texts:
        prompt = (
            f"Classify this text into exactly ONE of these categories: {', '.join(categories)}\n\n"
            f"TEXT: {text[:500]}\n\n"
            f"Respond with JSON: {{\"category\": \"...\", \"confidence\": \"high|medium|low\", "
            f"\"reasoning\": \"one sentence\"}}"
        )
        raw = await _generate(prompt, model=model, temperature=0.1)

        # Parse
        try:
            j_s, j_e = raw.find("{"), raw.rfind("}") + 1
            parsed = json.loads(raw[j_s:j_e]) if j_s >= 0 and j_e > j_s else {}
        except Exception:
            parsed = {}

        results.append(ClassifyResult(
            text=text[:200],
            category=parsed.get("category", "unknown"),
            confidence=parsed.get("confidence", "low"),
            reasoning=parsed.get("reasoning", raw[:200]),
        ))

    return results


# ─── API: Benchmark (A/B comparison) ─────────────────────────

class BenchmarkRequest(BaseModel):
    texts: list[str]
    threshold: float = 0.65
    model: str | None = None


class BenchmarkResult(BaseModel):
    total_texts: int
    # Embedding path
    embed_time_ms: int
    embed_results: list[dict]
    # LLM path
    llm_time_ms: int
    llm_results: list[dict]
    # Comparison
    agreement_rate: float
    speedup: float
    texts_screened_out: int
    texts_needing_llm: int
    hybrid_estimated_ms: int


@router.post("/benchmark", response_model=BenchmarkResult)
async def benchmark(req: BenchmarkRequest):
    """Run same texts through embedding screening and LLM classification. Compare."""
    if not _exemplars:
        raise HTTPException(400, "No exemplars. Add exemplars first.")

    categories = list(_exemplars.keys())

    # Embedding path
    t0 = time.monotonic()
    embed_results = await screen_texts(ScreenRequest(
        texts=req.texts, threshold=req.threshold
    ))
    embed_ms = int((time.monotonic() - t0) * 1000)

    # LLM path
    t0 = time.monotonic()
    llm_results = await classify_texts(ClassifyRequest(
        texts=req.texts, categories=categories, model=req.model
    ))
    llm_ms = int((time.monotonic() - t0) * 1000)

    # Compare
    agreements = 0
    screened_out = 0
    for er, lr in zip(embed_results, llm_results):
        if not er.above_threshold:
            screened_out += 1
        if er.best_category == lr.category:
            agreements += 1

    needing_llm = len(req.texts) - screened_out
    # Hybrid estimate: embed all + LLM only the uncertain ones
    per_text_embed_ms = embed_ms / max(len(req.texts), 1)
    per_text_llm_ms = llm_ms / max(len(req.texts), 1)
    hybrid_ms = int(embed_ms + needing_llm * per_text_llm_ms)
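    # Illustrative numbers (made up, not from a real run): with 100 texts,
    # embed_ms = 400, llm_ms = 40_000 (400 ms/text) and 70 texts screened out,
    # needing_llm = 30 and hybrid_ms = 400 + 30 * 400 = 12_400, i.e. roughly a
    # 3.2x speedup over the pure-LLM path.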

    return BenchmarkResult(
        total_texts=len(req.texts),
        embed_time_ms=embed_ms,
        embed_results=[r.model_dump() for r in embed_results],
        llm_time_ms=llm_ms,
        llm_results=[r.model_dump() for r in llm_results],
        agreement_rate=round(agreements / max(len(req.texts), 1), 3),
        speedup=round(llm_ms / max(hybrid_ms, 1), 2),
        texts_screened_out=screened_out,
        texts_needing_llm=needing_llm,
        hybrid_estimated_ms=hybrid_ms,
    )


# ─── API: Pipeline definition & execution ────────────────────

class PipelineStage(BaseModel):
    name: str
    mode: str  # "screen", "classify", "extract", "validate", "custom"
    config: dict = {}  # stage-specific config (threshold, prompt, etc.)


class PipelineDef(BaseModel):
    name: str
    stages: list[PipelineStage]
    description: str = ""


class PipelineRunRequest(BaseModel):
    pipeline_name: str
    texts: list[str]


@router.post("/pipelines")
async def save_pipeline(pipeline: PipelineDef):
    """Save a pipeline definition."""
    fp = _pipelines_dir() / f"{pipeline.name}.json"
    fp.write_text(pipeline.model_dump_json(indent=2))
    return {"ok": True, "name": pipeline.name}


@router.get("/pipelines")
async def list_pipelines():
    """List saved pipeline definitions."""
    pipelines = []
    for fp in _pipelines_dir().glob("*.json"):
        try:
            data = json.loads(fp.read_text())
            pipelines.append({"name": data["name"], "stages": len(data["stages"]),
                              "description": data.get("description", "")})
        except Exception:
            pass
    return {"pipelines": pipelines}


@router.get("/pipelines/{name}")
async def get_pipeline(name: str):
    fp = _pipelines_dir() / f"{name}.json"
    if not fp.exists():
        raise HTTPException(404, "Pipeline not found")
    return json.loads(fp.read_text())


@router.post("/pipelines/run")
async def run_pipeline(req: PipelineRunRequest):
    """Execute a pipeline on a set of texts. Returns per-stage results and timing."""
    fp = _pipelines_dir() / f"{req.pipeline_name}.json"
    if not fp.exists():
        raise HTTPException(404, f"Pipeline '{req.pipeline_name}' not found")

    pipeline = json.loads(fp.read_text())
    results = {"pipeline": req.pipeline_name, "stages": [], "total_ms": 0}
    current_texts = req.texts[:]

    for stage_def in pipeline["stages"]:
        stage_name = stage_def["name"]
        mode = stage_def["mode"]
        config = stage_def.get("config", {})
        t0 = time.monotonic()
        stage_result = {"name": stage_name, "mode": mode, "input_count": len(current_texts)}

        if mode == "screen":
            threshold = config.get("threshold", 0.65)
            screen_res = await screen_texts(ScreenRequest(
                texts=current_texts, threshold=threshold
            ))
            passed = [r for r in screen_res if r.above_threshold]
            stage_result["output_count"] = len(passed)
            stage_result["filtered_out"] = len(current_texts) - len(passed)
            stage_result["results"] = [r.model_dump() for r in screen_res]
            # Pass only above-threshold texts to next stage
            current_texts = [r.text for r in screen_res if r.above_threshold]

        elif mode == "classify":
            cls_res = await classify_texts(ClassifyRequest(
                texts=current_texts,
                categories=config.get("categories"),
                model=config.get("model"),
            ))
            stage_result["output_count"] = len(cls_res)
            stage_result["results"] = [r.model_dump() for r in cls_res]

        elif mode == "extract":
            extract_prompt = config.get("prompt", "Extract key information from this text:")
            extractions = []
            for text in current_texts:
                raw = await _generate(f"{extract_prompt}\n\nTEXT: {text[:800]}")
                extractions.append({"text": text[:200], "extracted": raw})
            stage_result["output_count"] = len(extractions)
            stage_result["results"] = extractions

        elif mode == "validate":
            # Embedding-based dedup: find near-duplicate results
            if len(current_texts) > 1:
                embs = await _embed_texts(current_texts)
                dupes = []
                threshold = config.get("dedup_threshold", 0.92)
                for i in range(len(embs)):
                    for j in range(i + 1, len(embs)):
                        sim = cosine_similarity(embs[i], embs[j])
                        if sim >= threshold:
                            dupes.append({"i": i, "j": j, "similarity": round(sim, 4),
                                          "text_a": current_texts[i][:100],
                                          "text_b": current_texts[j][:100]})
                stage_result["duplicates_found"] = len(dupes)
                stage_result["results"] = dupes
            else:
                stage_result["duplicates_found"] = 0
                stage_result["results"] = []
            stage_result["output_count"] = len(current_texts)

        else:
            stage_result["error"] = f"Unknown mode: {mode}"
            stage_result["output_count"] = len(current_texts)

        stage_ms = int((time.monotonic() - t0) * 1000)
        stage_result["time_ms"] = stage_ms
        results["stages"].append(stage_result)
        results["total_ms"] += stage_ms

    return results


# ─── API: REPL cell (free-form eval) ─────────────────────────

class CellRequest(BaseModel):
    action: str  # "embed", "generate", "similarity", "screen", "classify"
    text: str = ""
    texts: list[str] = []
    params: dict = {}
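
# Illustrative cell payloads (values are made up; each maps to one branch of
# run_cell below):
#
#   {"action": "similarity", "texts": ["parquet tables", "columnar storage", "react hooks"]}
#   {"action": "generate", "text": "Summarize: ...", "params": {"model": "qwen2.5", "temperature": 0.2}}
#   {"action": "screen", "texts": ["We decided to use Parquet"], "params": {"threshold": 0.7}}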


@router.post("/cell")
async def run_cell(req: CellRequest):
    """Execute a single notebook cell. Flexible entry point for ad-hoc operations."""
    t0 = time.monotonic()
    result = {}

    if req.action == "embed":
        texts = req.texts or ([req.text] if req.text else [])
        embs = await _embed_texts(texts)
        result = {"embeddings_count": len(embs), "dimensions": len(embs[0]) if embs else 0,
                  "texts": texts}

    elif req.action == "generate":
        text = await _generate(req.text, **{k: v for k, v in req.params.items()
                                            if k in ("model", "temperature")})
        result = {"text": text}

    elif req.action == "similarity":
        if len(req.texts) < 2:
            raise HTTPException(400, "Need at least 2 texts for similarity")
        embs = await _embed_texts(req.texts)
        matrix = []
        for i in range(len(embs)):
            row = []
            for j in range(len(embs)):
                row.append(round(cosine_similarity(embs[i], embs[j]), 4))
            matrix.append(row)
        result = {"matrix": matrix, "texts": [t[:80] for t in req.texts]}

    elif req.action == "screen":
        texts = req.texts or ([req.text] if req.text else [])
        threshold = req.params.get("threshold", 0.65)
        res = await screen_texts(ScreenRequest(texts=texts, threshold=threshold))
        result = {"results": [r.model_dump() for r in res]}

    elif req.action == "classify":
        texts = req.texts or ([req.text] if req.text else [])
        res = await classify_texts(ClassifyRequest(texts=texts))
        result = {"results": [r.model_dump() for r in res]}

    else:
        raise HTTPException(400, f"Unknown action: {req.action}")

    result["time_ms"] = int((time.monotonic() - t0) * 1000)
    return result