Surfaced by today's untracked-files audit. None of these are accidental: several
are referenced by name in CLAUDE.md and memory files but were never added to the
repository.
Categories:
- docs/PHASE_AUDIT_GUIDE.md (106 LOC) — Claude Code phase audit guidance
- ops/systemd/lakehouse-langfuse-bridge.service — Langfuse bridge unit
- package.json — top-level npm manifest
- scripts/e2e_pipeline_check.sh + production_smoke.sh — real test scripts
- reports/kimi/audit-last-week*.md — the reports the "Two reports live" note in CLAUDE.md cites
- tests/multi-agent/scenarios/ — 44 staffing scenarios (cutover decision A)
- tests/multi-agent/playbooks/ — 102 playbook records
- tests/battery/, tests/agent_test/PRD.md, tests/real-world/* — real tests
- sidecar/sidecar/{lab_ui,pipeline_lab}.py — 888 LOC dev-only UIs that
remain in service post-sidecar-drop (commit ba928b1 explicitly kept them)
Sensitivity check: scenarios use synthetic company names ("Heritage Foods",
"Cornerstone Fabrication"); audit reports describe code findings only;
no PII or secrets surfaced.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

sidecar/sidecar/pipeline_lab.py (504 lines, 17 KiB, Python):

"""Pipeline Lab — iterative embedding/LLM pipeline experimentation.
|
|
|
|
Provides:
|
|
- Exemplar-based embedding classification (fast screening)
|
|
- LLM-based classification (accurate but slow)
|
|
- A/B benchmarking between the two
|
|
- Pipeline definition and execution
|
|
- Notebook-style API for interactive experimentation
|
|
"""

import json
import math
import os
import time
from pathlib import Path
from typing import Optional

from fastapi import APIRouter, HTTPException
from fastapi.responses import HTMLResponse
from pydantic import BaseModel

from .ollama import client

router = APIRouter()

EMBED_MODEL = os.environ.get("EMBED_MODEL", "nomic-embed-text")
GEN_MODEL = os.environ.get("GEN_MODEL", "qwen2.5")
LAB_DIR = Path(os.environ.get("LAB_DIR", "./data/_pipeline_lab"))
LAB_DIR.mkdir(parents=True, exist_ok=True)


# ─── Vector math ─────────────────────────────────────────────

def cosine_similarity(a: list[float], b: list[float]) -> float:
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)


# ─── Exemplar store ──────────────────────────────────────────
# Exemplars are labeled text+embedding pairs used for classification.
# e.g. category="decision" texts=["We decided to use Parquet", "The team chose React"]

_exemplars: dict[str, list[dict]] = {}  # category -> [{text, embedding}]


def _exemplar_file() -> Path:
    return LAB_DIR / "exemplars.json"


def _load_exemplars():
    global _exemplars
    fp = _exemplar_file()
    if fp.exists():
        data = json.loads(fp.read_text())
        _exemplars = data
    return _exemplars


def _save_exemplars():
    _exemplar_file().write_text(json.dumps(_exemplars, indent=2))


_load_exemplars()


# ─── Pipeline store ──────────────────────────────────────────

def _pipelines_dir() -> Path:
    d = LAB_DIR / "pipelines"
    d.mkdir(exist_ok=True)
    return d


# ─── Embedding helper ────────────────────────────────────────

async def _embed_texts(texts: list[str], model: str = EMBED_MODEL) -> list[list[float]]:
    embeddings = []
    async with client() as c:
        for text in texts:
            resp = await c.post("/api/embed", json={"model": model, "input": text})
            if resp.status_code != 200:
                raise HTTPException(502, f"Ollama embed error: {resp.text}")
            data = resp.json()
            embeddings.extend(data.get("embeddings", []))
    return embeddings


async def _generate(prompt: str, model: str = GEN_MODEL, temperature: float = 0.3) -> str:
    async with client() as c:
        resp = await c.post("/api/generate", json={
            "model": model, "prompt": prompt, "stream": False,
            "options": {"temperature": temperature, "num_predict": 1024}
        })
        if resp.status_code != 200:
            raise HTTPException(502, f"Ollama generate error: {resp.text}")
        return resp.json().get("response", "")


# ─── API: Exemplars ──────────────────────────────────────────

class ExemplarAdd(BaseModel):
    category: str
    texts: list[str]


class ExemplarList(BaseModel):
    categories: dict[str, int]  # category -> count


@router.post("/exemplars")
async def add_exemplars(req: ExemplarAdd):
    """Add labeled exemplar texts for a category. Embeddings generated automatically."""
    category = req.category.strip().lower()
    if not category or not req.texts:
        raise HTTPException(400, "category and texts required")

    embeddings = await _embed_texts(req.texts)

    if category not in _exemplars:
        _exemplars[category] = []

    for text, emb in zip(req.texts, embeddings):
        _exemplars[category].append({"text": text, "embedding": emb})

    _save_exemplars()
    return {"ok": True, "category": category, "added": len(req.texts),
            "total": len(_exemplars[category])}


@router.get("/exemplars")
async def list_exemplars():
    """List all exemplar categories and counts."""
    return {"categories": {k: len(v) for k, v in _exemplars.items()},
            "total": sum(len(v) for v in _exemplars.values())}


@router.delete("/exemplars/{category}")
async def delete_exemplar_category(category: str):
    if category in _exemplars:
        del _exemplars[category]
        _save_exemplars()
    return {"ok": True}


# ─── API: Screen (embedding-based classification) ────────────

class ScreenRequest(BaseModel):
    texts: list[str]
    threshold: float = 0.65
    top_k: int = 1


class ScreenResult(BaseModel):
    text: str
    best_category: str | None
    similarity: float
    above_threshold: bool
    all_scores: dict[str, float]


@router.post("/screen", response_model=list[ScreenResult])
async def screen_texts(req: ScreenRequest):
    """Classify texts by cosine similarity to exemplar embeddings (fast path)."""
    if not _exemplars:
        raise HTTPException(400, "No exemplars defined. Add exemplars first.")

    embeddings = await _embed_texts(req.texts)
    results = []

    for text, emb in zip(req.texts, embeddings):
        category_scores = {}
        for category, exemplar_list in _exemplars.items():
            sims = [cosine_similarity(emb, ex["embedding"]) for ex in exemplar_list]
            category_scores[category] = max(sims) if sims else 0.0

        best_cat = max(category_scores, key=category_scores.get) if category_scores else None
        best_sim = category_scores.get(best_cat, 0.0) if best_cat else 0.0

        results.append(ScreenResult(
            text=text[:200],
            best_category=best_cat if best_sim >= req.threshold else None,
            similarity=round(best_sim, 4),
            above_threshold=best_sim >= req.threshold,
            all_scores={k: round(v, 4) for k, v in sorted(category_scores.items(),
                                                          key=lambda x: x[1], reverse=True)},
        ))

    return results


# ─── API: Classify (LLM-based classification) ────────────────

class ClassifyRequest(BaseModel):
    texts: list[str]
    categories: list[str] | None = None  # if None, use exemplar category names
    model: str | None = None


class ClassifyResult(BaseModel):
    text: str
    category: str
    confidence: str
    reasoning: str


@router.post("/classify", response_model=list[ClassifyResult])
async def classify_texts(req: ClassifyRequest):
    """Classify texts using LLM (slow but accurate path)."""
    categories = req.categories or list(_exemplars.keys())
    if not categories:
        raise HTTPException(400, "No categories. Provide categories or add exemplars.")

    model = req.model or GEN_MODEL
    results = []

    for text in req.texts:
        prompt = (
            f"Classify this text into exactly ONE of these categories: {', '.join(categories)}\n\n"
            f"TEXT: {text[:500]}\n\n"
            f"Respond with JSON: {{\"category\": \"...\", \"confidence\": \"high|medium|low\", "
            f"\"reasoning\": \"one sentence\"}}"
        )
        raw = await _generate(prompt, model=model, temperature=0.1)

        # Parse
        try:
            j_s, j_e = raw.find("{"), raw.rfind("}") + 1
            parsed = json.loads(raw[j_s:j_e]) if j_s >= 0 and j_e > j_s else {}
        except Exception:
            parsed = {}

        results.append(ClassifyResult(
            text=text[:200],
            category=parsed.get("category", "unknown"),
            confidence=parsed.get("confidence", "low"),
            reasoning=parsed.get("reasoning", raw[:200]),
        ))

    return results


# ─── API: Benchmark (A/B comparison) ─────────────────────────

class BenchmarkRequest(BaseModel):
    texts: list[str]
    threshold: float = 0.65
    model: str | None = None


class BenchmarkResult(BaseModel):
    total_texts: int
    # Embedding path
    embed_time_ms: int
    embed_results: list[dict]
    # LLM path
    llm_time_ms: int
    llm_results: list[dict]
    # Comparison
    agreement_rate: float
    speedup: float
    texts_screened_out: int
    texts_needing_llm: int
    hybrid_estimated_ms: int


@router.post("/benchmark", response_model=BenchmarkResult)
async def benchmark(req: BenchmarkRequest):
    """Run same texts through embedding screening and LLM classification. Compare."""
    if not _exemplars:
        raise HTTPException(400, "No exemplars. Add exemplars first.")

    categories = list(_exemplars.keys())

    # Embedding path
    t0 = time.monotonic()
    embed_results = await screen_texts(ScreenRequest(
        texts=req.texts, threshold=req.threshold
    ))
    embed_ms = int((time.monotonic() - t0) * 1000)

    # LLM path
    t0 = time.monotonic()
    llm_results = await classify_texts(ClassifyRequest(
        texts=req.texts, categories=categories, model=req.model
    ))
    llm_ms = int((time.monotonic() - t0) * 1000)

    # Compare
    agreements = 0
    screened_out = 0
    for er, lr in zip(embed_results, llm_results):
        if not er.above_threshold:
            screened_out += 1
        if er.best_category == lr.category:
            agreements += 1

    needing_llm = len(req.texts) - screened_out
    # Hybrid estimate: embed all + LLM only the uncertain ones
    per_text_embed_ms = embed_ms / max(len(req.texts), 1)
    per_text_llm_ms = llm_ms / max(len(req.texts), 1)
    hybrid_ms = int(embed_ms + needing_llm * per_text_llm_ms)
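    # Illustrative numbers (made up, not from a real run): with 100 texts,
    # embed_ms = 400, llm_ms = 40_000 (400 ms/text) and 70 texts screened out,
    # needing_llm = 30 and hybrid_ms = 400 + 30 * 400 = 12_400, i.e. roughly a
    # 3.2x speedup over the pure-LLM path.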

    return BenchmarkResult(
        total_texts=len(req.texts),
        embed_time_ms=embed_ms,
        embed_results=[r.model_dump() for r in embed_results],
        llm_time_ms=llm_ms,
        llm_results=[r.model_dump() for r in llm_results],
        agreement_rate=round(agreements / max(len(req.texts), 1), 3),
        speedup=round(llm_ms / max(hybrid_ms, 1), 2),
        texts_screened_out=screened_out,
        texts_needing_llm=needing_llm,
        hybrid_estimated_ms=hybrid_ms,
    )


# ─── API: Pipeline definition & execution ────────────────────

class PipelineStage(BaseModel):
    name: str
    mode: str  # "screen", "classify", "extract", "validate", "custom"
    config: dict = {}  # stage-specific config (threshold, prompt, etc.)


class PipelineDef(BaseModel):
    name: str
    stages: list[PipelineStage]
    description: str = ""


class PipelineRunRequest(BaseModel):
    pipeline_name: str
    texts: list[str]


@router.post("/pipelines")
async def save_pipeline(pipeline: PipelineDef):
    """Save a pipeline definition."""
    fp = _pipelines_dir() / f"{pipeline.name}.json"
    fp.write_text(pipeline.model_dump_json(indent=2))
    return {"ok": True, "name": pipeline.name}


@router.get("/pipelines")
async def list_pipelines():
    """List saved pipeline definitions."""
    pipelines = []
    for fp in _pipelines_dir().glob("*.json"):
        try:
            data = json.loads(fp.read_text())
            pipelines.append({"name": data["name"], "stages": len(data["stages"]),
                              "description": data.get("description", "")})
        except Exception:
            pass
    return {"pipelines": pipelines}


@router.get("/pipelines/{name}")
async def get_pipeline(name: str):
    fp = _pipelines_dir() / f"{name}.json"
    if not fp.exists():
        raise HTTPException(404, "Pipeline not found")
    return json.loads(fp.read_text())


@router.post("/pipelines/run")
async def run_pipeline(req: PipelineRunRequest):
    """Execute a pipeline on a set of texts. Returns per-stage results and timing."""
    fp = _pipelines_dir() / f"{req.pipeline_name}.json"
    if not fp.exists():
        raise HTTPException(404, f"Pipeline '{req.pipeline_name}' not found")

    pipeline = json.loads(fp.read_text())
    results = {"pipeline": req.pipeline_name, "stages": [], "total_ms": 0}
    current_texts = req.texts[:]

    for stage_def in pipeline["stages"]:
        stage_name = stage_def["name"]
        mode = stage_def["mode"]
        config = stage_def.get("config", {})
        t0 = time.monotonic()
        stage_result = {"name": stage_name, "mode": mode, "input_count": len(current_texts)}

        if mode == "screen":
            threshold = config.get("threshold", 0.65)
            screen_res = await screen_texts(ScreenRequest(
                texts=current_texts, threshold=threshold
            ))
            passed = [r for r in screen_res if r.above_threshold]
            stage_result["output_count"] = len(passed)
            stage_result["filtered_out"] = len(current_texts) - len(passed)
            stage_result["results"] = [r.model_dump() for r in screen_res]
            # Pass only above-threshold texts to next stage
            current_texts = [r.text for r in screen_res if r.above_threshold]

        elif mode == "classify":
            cls_res = await classify_texts(ClassifyRequest(
                texts=current_texts,
                categories=config.get("categories"),
                model=config.get("model"),
            ))
            stage_result["output_count"] = len(cls_res)
            stage_result["results"] = [r.model_dump() for r in cls_res]

        elif mode == "extract":
            extract_prompt = config.get("prompt", "Extract key information from this text:")
            extractions = []
            for text in current_texts:
                raw = await _generate(f"{extract_prompt}\n\nTEXT: {text[:800]}")
                extractions.append({"text": text[:200], "extracted": raw})
            stage_result["output_count"] = len(extractions)
            stage_result["results"] = extractions

        elif mode == "validate":
            # Embedding-based dedup: find near-duplicate results
            if len(current_texts) > 1:
                embs = await _embed_texts(current_texts)
                dupes = []
                threshold = config.get("dedup_threshold", 0.92)
                for i in range(len(embs)):
                    for j in range(i + 1, len(embs)):
                        sim = cosine_similarity(embs[i], embs[j])
                        if sim >= threshold:
                            dupes.append({"i": i, "j": j, "similarity": round(sim, 4),
                                          "text_a": current_texts[i][:100],
                                          "text_b": current_texts[j][:100]})
                stage_result["duplicates_found"] = len(dupes)
                stage_result["results"] = dupes
            else:
                stage_result["duplicates_found"] = 0
                stage_result["results"] = []
            stage_result["output_count"] = len(current_texts)

        else:
            stage_result["error"] = f"Unknown mode: {mode}"
            stage_result["output_count"] = len(current_texts)

        stage_ms = int((time.monotonic() - t0) * 1000)
        stage_result["time_ms"] = stage_ms
        results["stages"].append(stage_result)
        results["total_ms"] += stage_ms

    return results


# ─── API: REPL cell (free-form eval) ─────────────────────────

class CellRequest(BaseModel):
    action: str  # "embed", "generate", "similarity", "screen", "classify"
    text: str = ""
    texts: list[str] = []
    params: dict = {}
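
# Illustrative cell payloads (values are made up; each maps to one branch of
# run_cell below):
#
#   {"action": "similarity", "texts": ["parquet tables", "columnar storage", "react hooks"]}
#   {"action": "generate", "text": "Summarize: ...", "params": {"model": "qwen2.5", "temperature": 0.2}}
#   {"action": "screen", "texts": ["We decided to use Parquet"], "params": {"threshold": 0.7}}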


@router.post("/cell")
async def run_cell(req: CellRequest):
    """Execute a single notebook cell. Flexible entry point for ad-hoc operations."""
    t0 = time.monotonic()
    result = {}

    if req.action == "embed":
        texts = req.texts or ([req.text] if req.text else [])
        embs = await _embed_texts(texts)
        result = {"embeddings_count": len(embs), "dimensions": len(embs[0]) if embs else 0,
                  "texts": texts}

    elif req.action == "generate":
        text = await _generate(req.text, **{k: v for k, v in req.params.items()
                                            if k in ("model", "temperature")})
        result = {"text": text}

    elif req.action == "similarity":
        if len(req.texts) < 2:
            raise HTTPException(400, "Need at least 2 texts for similarity")
        embs = await _embed_texts(req.texts)
        matrix = []
        for i in range(len(embs)):
            row = []
            for j in range(len(embs)):
                row.append(round(cosine_similarity(embs[i], embs[j]), 4))
            matrix.append(row)
        result = {"matrix": matrix, "texts": [t[:80] for t in req.texts]}

    elif req.action == "screen":
        texts = req.texts or ([req.text] if req.text else [])
        threshold = req.params.get("threshold", 0.65)
        res = await screen_texts(ScreenRequest(texts=texts, threshold=threshold))
        result = {"results": [r.model_dump() for r in res]}

    elif req.action == "classify":
        texts = req.texts or ([req.text] if req.text else [])
        res = await classify_texts(ClassifyRequest(texts=texts))
        result = {"results": [r.model_dump() for r in res]}

    else:
        raise HTTPException(400, f"Unknown action: {req.action}")

    result["time_ms"] = int((time.monotonic() - t0) * 1000)
    return result