root d77622fc6b distillation: fix 7 grounding bugs found by Kimi audit
Kimi For Coding (api.kimi.com, kimi-for-coding) ran a forensic audit on
distillation v1.0.0 with full file content. 7/7 flags verified real on
grep. Substrate now matches what v1.0.0 claimed: deterministic, no
schema bypasses, Rust tests compile.

Fixes:
- mode.rs:1035,1042  matrix_corpus Some/None -> vec![..]/vec![]; cargo
                     check --tests now compiles (was silently broken;
                     only bun tests were running)
- scorer.ts:30       SCORER_VERSION env override removed - identical
                     input now produces identical version stamp, not
                     env-dependent drift
- transforms.ts:181  auto_apply wall-clock fallback (new Date()) ->
                     deterministic recorded_at fallback
- replay.ts:378      recorded_run_id Date.now() -> sha256(recorded_at);
                     replay rows now reproducible given recorded_at
- receipts.ts:454,495  input_hash_match hardcoded true was misleading
                       telemetry; bumped DRIFT_REPORT_SCHEMA_VERSION 1->2,
                       field is now boolean|null with honest null when
                       not computed at this layer
- score_runs.ts:89-100,159  dedup keyed only on sig_hash made
                            scorer-version bumps invisible. Composite
                            sig_hash:scorer_version forces re-scoring
- export_sft.ts:126  (ev as any).contractor bypass emitted "<contractor>"
                     placeholder for every contract_analyses SFT row.
                     Added typed EvidenceRecord.metadata bucket;
                     transforms.ts populates metadata.contractor;
                     exporter reads typed value

Verification (all green):
  cargo check -p gateway --tests   compiles
  bun test tests/distillation/     145 pass / 0 fail
  bun acceptance                   22/22 invariants
  bun audit-full                   16/16 required checks

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-27 05:34:31 -05:00

308 lines
13 KiB
TypeScript

// scorer.ts — pure deterministic Success Scorer.
//
// Takes one EvidenceRecord, returns category + reasons + sub_scores.
// NO I/O, NO LLM, NO clock reads, NO mutable state. The only randomness
// allowed is none. Identical input → identical output forever.
//
// Three-class strategy (see docs/recon/local-distillation-recon.md +
// data/_kb/evidence_health.md for the source taxonomy):
//
// CLASS A — verdict-bearing
// scrum_reviews, observer_reviews, audits, contract_analyses
// Direct scoring from existing markers/observer_verdict
//
// CLASS B — telemetry-rich
// auto_apply, outcomes, mode_experiments
// Markers exist but partial; needs_human_review fills the gap
//
// CLASS C — pure-extraction (no native scoring signal)
// distilled_*, audit_facts, observer_escalations
// Default needs_human_review; v2 will JOIN to parent verdict
//
// scorer_version is stamped on every output. Bumping it lets a
// downstream re-scoring detect drift between historical runs.
import type { EvidenceRecord } from "../../auditor/schemas/distillation/evidence_record";
import type { ScoreCategory, ScoredRun } from "../../auditor/schemas/distillation/scored_run";
import { SCORED_RUN_SCHEMA_VERSION } from "../../auditor/schemas/distillation/scored_run";
import { canonicalSha256 } from "../../auditor/schemas/distillation/types";
// Hardcoded — the deterministic-output contract requires this. Bump the
// literal in the same commit as any scoring-rule change so the version
// stamp moves atomically with logic. Env override removed 2026-04-27
// after Kimi audit flagged identical-input-different-version drift.
export const SCORER_VERSION = "v1.0.0";
// Result of scoring one EvidenceRecord: the category decision, a
// human-readable reason trail explaining how it was reached, and the
// structured per-dimension signals carried into ScoredRun.sub_scores.
export interface ScoreOutput {
category: ScoreCategory;
reasons: string[];
sub_scores: ScoredRun["sub_scores"];
}
// Map source_file (from provenance) → source class. Centralized so
// adding a new source is one-line.
type SourceClass = "verdict" | "telemetry" | "extraction";
// Classify a provenance source_file into one of the three scoring
// classes. Comparison is by stem: the data/_kb/ prefix and .jsonl
// suffix are stripped first so callers may pass either form.
function sourceClassFor(source_file: string): SourceClass {
  const stem = source_file.replace(/^data\/_kb\//, "").replace(/\.jsonl$/, "");
  const verdictStems = ["scrum_reviews", "observer_reviews", "audits", "contract_analyses"];
  const telemetryStems = ["auto_apply", "outcomes", "mode_experiments"];
  if (verdictStems.includes(stem)) {
    return "verdict";
  }
  if (telemetryStems.includes(stem)) {
    return "telemetry";
  }
  // distilled_*, audit_facts, observer_escalations — and any unknown
  // source — route to extraction, the most conservative class (forces
  // needs_human_review until a dedicated transform is added).
  return "extraction";
}
// ─── Class A: verdict-bearing ─────────────────────────────────────
// Score a scrum_reviews row from its accepted_on_attempt_N success
// marker: attempt 1 → accepted, attempts 2+ → partially_accepted.
// Missing or malformed markers defer to human review.
function scoreScrumReview(r: EvidenceRecord): ScoreOutput {
  const reasons: string[] = [];
  const subs: ScoredRun["sub_scores"] = {};
  const successMarker = (r.success_markers ?? []).find(m => m.startsWith("accepted_on_attempt_"));
  if (!successMarker) {
    reasons.push("scrum_review missing accepted_on_attempt_* success marker");
    return { category: "needs_human_review", reasons, sub_scores: subs };
  }
  const attempt = Number(successMarker.replace("accepted_on_attempt_", ""));
  // Guard against malformed markers (e.g. "accepted_on_attempt_oops"):
  // NaN previously flowed through to "accepted only after NaN attempts"
  // with a NaN sub_score and a partially_accepted category. Route such
  // rows to human review instead.
  if (!Number.isInteger(attempt) || attempt < 1) {
    reasons.push(`scrum_review malformed accepted_on_attempt_* marker: ${successMarker}`);
    return { category: "needs_human_review", reasons, sub_scores: subs };
  }
  subs.accepted_on_attempt = attempt;
  if (attempt === 1) {
    reasons.push("scrum: accepted on first attempt");
    return { category: "accepted", reasons, sub_scores: subs };
  }
  if (attempt <= 3) {
    reasons.push(`scrum: accepted after ${attempt} attempts`);
    return { category: "partially_accepted", reasons, sub_scores: subs };
  }
  reasons.push(`scrum: accepted only after ${attempt} attempts (high-cost path)`);
  return { category: "partially_accepted", reasons, sub_scores: subs };
}
// Score an observer_reviews row directly from observer_verdict:
// accept → accepted, reject → rejected, cycle → partially_accepted,
// anything else → needs_human_review.
function scoreObserverReview(r: EvidenceRecord): ScoreOutput {
  const reasons: string[] = [];
  const subs: ScoredRun["sub_scores"] = {};
  const verdict = r.observer_verdict;
  switch (verdict) {
    case "accept":
      subs.observer_verdict = "accept";
      reasons.push("observer accepted the reviewed attempt");
      return { category: "accepted", reasons, sub_scores: subs };
    case "reject":
      subs.observer_verdict = "reject";
      reasons.push("observer rejected the reviewed attempt");
      return { category: "rejected", reasons, sub_scores: subs };
    case "cycle":
      subs.observer_verdict = "cycle";
      reasons.push("observer flagged the attempt as cycling — partial signal");
      return { category: "partially_accepted", reasons, sub_scores: subs };
    default:
      reasons.push(`observer_verdict missing or unrecognized: ${JSON.stringify(verdict ?? null)}`);
      return { category: "needs_human_review", reasons, sub_scores: subs };
  }
}
// Score an audits.jsonl row — the auditor's per-finding stream, not PR
// verdicts. Legacy overall markers ("approved"/"blocked"/
// "request_changes") are checked first for back-compat with pre-fix
// materializations on disk; otherwise the Phase 2 severity markers
// decide:
//   audit_severity_{info,low}       → accepted (minor finding)
//   audit_severity_medium           → partially_accepted
//   audit_severity_{high,critical}  → rejected (real problem)
function scoreAudit(r: EvidenceRecord): ScoreOutput {
  const reasons: string[] = [];
  const subs: ScoredRun["sub_scores"] = {};
  const successMarkers = r.success_markers ?? [];
  const failureMarkers = r.failure_markers ?? [];
  // Legacy overall markers (checked first — they predate severity markers).
  if (successMarkers.includes("approved")) {
    reasons.push("audit overall=approved (legacy marker)");
    return { category: "accepted", reasons, sub_scores: subs };
  }
  if (failureMarkers.includes("blocked")) {
    reasons.push("audit overall=block (legacy marker)");
    return { category: "rejected", reasons, sub_scores: subs };
  }
  if (failureMarkers.includes("request_changes")) {
    reasons.push("audit overall=request_changes (legacy marker)");
    return { category: "partially_accepted", reasons, sub_scores: subs };
  }
  // Severity-derived markers (current Phase 2 transform). A severity
  // marker in success_markers means the transform judged it minor.
  const severityInSuccess = successMarkers.find(m => m.startsWith("audit_severity_"));
  if (severityInSuccess) {
    reasons.push(`${severityInSuccess} → minor finding`);
    return { category: "accepted", reasons, sub_scores: subs };
  }
  const severityInFailure = failureMarkers.find(m => m.startsWith("audit_severity_"));
  if (severityInFailure === "audit_severity_medium") {
    reasons.push("audit_severity_medium → finding warrants review");
    return { category: "partially_accepted", reasons, sub_scores: subs };
  }
  const blocking = severityInFailure === "audit_severity_high" || severityInFailure === "audit_severity_critical";
  if (blocking) {
    reasons.push(`${severityInFailure} → blocking finding`);
    return { category: "rejected", reasons, sub_scores: subs };
  }
  reasons.push("audit row has no severity or overall marker");
  return { category: "needs_human_review", reasons, sub_scores: subs };
}
// Score a contract_analyses row from its observer verdict. An explicit
// observer_rejected failure marker takes precedence over (and also
// covers) verdict === "reject", so rejection beats an absent verdict.
function scoreContractAnalysis(r: EvidenceRecord): ScoreOutput {
  const reasons: string[] = [];
  const subs: ScoredRun["sub_scores"] = {};
  const verdict = r.observer_verdict;
  const explicitlyRejected = (r.failure_markers ?? []).includes("observer_rejected");
  if (explicitlyRejected || verdict === "reject") {
    subs.observer_verdict = "reject";
    reasons.push("contract analysis: observer rejected");
    return { category: "rejected", reasons, sub_scores: subs };
  }
  if (verdict === "accept") {
    subs.observer_verdict = "accept";
    reasons.push("contract analysis: observer accepted");
    return { category: "accepted", reasons, sub_scores: subs };
  }
  if (verdict === "cycle") {
    subs.observer_verdict = "cycle";
    reasons.push("contract analysis: observer cycled (partial)");
    return { category: "partially_accepted", reasons, sub_scores: subs };
  }
  reasons.push("contract analysis: no observer verdict signal");
  return { category: "needs_human_review", reasons, sub_scores: subs };
}
// ─── Class B: telemetry-rich ──────────────────────────────────────
// Score an auto_apply row: "committed" success marker → accepted; any
// failure marker containing "reverted" → rejected (with cargo_green
// false when the revert was build_red); neither → needs_human_review
// (likely no_patches or dry_run — not a failure, but no commit either).
function scoreAutoApply(r: EvidenceRecord): ScoreOutput {
  const reasons: string[] = [];
  const subs: ScoredRun["sub_scores"] = {};
  const committed = (r.success_markers ?? []).includes("committed");
  if (committed) {
    subs.cargo_green = true;
    reasons.push("auto_apply: patch committed (cargo green + warning baseline + rationale alignment passed)");
    return { category: "accepted", reasons, sub_scores: subs };
  }
  const revertMarker = (r.failure_markers ?? []).find(f => f.includes("reverted"));
  if (revertMarker !== undefined) {
    if (revertMarker.includes("build_red")) {
      subs.cargo_green = false;
    }
    reasons.push(`auto_apply: ${revertMarker}`);
    return { category: "rejected", reasons, sub_scores: subs };
  }
  reasons.push("auto_apply: no commit + no revert (likely no_patches or dry_run)");
  return { category: "needs_human_review", reasons, sub_scores: subs };
}
// Score an outcomes row: "all_events_ok" → accepted; a positive
// gap_signals count in validation_results → partially_accepted;
// otherwise needs_human_review.
function scoreOutcomes(r: EvidenceRecord): ScoreOutput {
  const reasons: string[] = [];
  const subs: ScoredRun["sub_scores"] = {};
  const allOk = (r.success_markers ?? []).includes("all_events_ok");
  if (allOk) {
    reasons.push("outcomes: all events ok");
    return { category: "accepted", reasons, sub_scores: subs };
  }
  // Validation results may carry partial signal.
  const gapSignals = r.validation_results?.gap_signals as number | undefined;
  const hasGaps = typeof gapSignals === "number" && gapSignals > 0;
  if (hasGaps) {
    reasons.push(`outcomes: ${gapSignals} gap signal(s) detected`);
    return { category: "partially_accepted", reasons, sub_scores: subs };
  }
  reasons.push("outcomes: no decisive marker — defer to human");
  return { category: "needs_human_review", reasons, sub_scores: subs };
}
// Score a mode_experiments row. Phase 2 doesn't derive markers for this
// source yet, so the v1 heuristic is: empty/missing response text →
// rejected; response present but latency over the 2-minute soft cap →
// partially_accepted; otherwise needs_human_review until the
// grounding-from-mode_compare hook lands (Phase 4 / re-scoring).
function scoreModeExperiment(r: EvidenceRecord): ScoreOutput {
  const reasons: string[] = [];
  const subs: ScoredRun["sub_scores"] = {};
  const hasResponseText = typeof r.text === "string" && r.text.trim().length > 0;
  if (!hasResponseText) {
    reasons.push("mode_experiment: empty response text");
    return { category: "rejected", reasons, sub_scores: subs };
  }
  const latency = r.latency_ms;
  const overCap = typeof latency === "number" && latency > 120_000;
  if (overCap) {
    reasons.push(`mode_experiment: latency ${latency}ms exceeds 2-minute soft cap`);
    return { category: "partially_accepted", reasons, sub_scores: subs };
  }
  reasons.push("mode_experiment: response present, latency within bounds; verdict not yet wired");
  return { category: "needs_human_review", reasons, sub_scores: subs };
}
// ─── Class C: pure-extraction ────────────────────────────────────
// Phase 3 v1: extraction-class records have no native scoring signal,
// so every row defers to a human with one explicit reason. Phase 3 v2
// will JOIN to a parent verdict-bearing record instead.
function scoreExtraction(_r: EvidenceRecord): ScoreOutput {
  return {
    category: "needs_human_review",
    reasons: ["extraction-class source has no native scoring signal — JOIN to parent verdict pending Phase 3 v2"],
    sub_scores: {},
  };
}
// ─── Dispatch ─────────────────────────────────────────────────────
// Dispatch one EvidenceRecord to its class-specific scorer based on
// provenance.source_file. Anything without a dedicated scorer falls
// through to the extraction default (needs_human_review).
export function scoreRecord(record: EvidenceRecord): ScoreOutput {
  const sourceFile = record.provenance.source_file;
  const stem = sourceFile.replace(/^data\/_kb\//, "").replace(/\.jsonl$/, "");
  const cls = sourceClassFor(sourceFile);
  const verdictScorers: Record<string, (r: EvidenceRecord) => ScoreOutput> = {
    scrum_reviews: scoreScrumReview,
    observer_reviews: scoreObserverReview,
    audits: scoreAudit,
    contract_analyses: scoreContractAnalysis,
  };
  const telemetryScorers: Record<string, (r: EvidenceRecord) => ScoreOutput> = {
    auto_apply: scoreAutoApply,
    outcomes: scoreOutcomes,
    mode_experiments: scoreModeExperiment,
  };
  const scorer =
    cls === "verdict" ? verdictScorers[stem]
    : cls === "telemetry" ? telemetryScorers[stem]
    : undefined;
  return scorer ? scorer(record) : scoreExtraction(record);
}
// Build a complete ScoredRun from one EvidenceRecord. The caller
// supplies recorded_at plus the source file / line offset used to
// populate provenance; scored_at is stamped with recorded_at (no clock
// reads — determinism contract).
export async function buildScoredRun(
record: EvidenceRecord,
source_file_relpath: string,
line_offset: number,
recorded_at: string,
): Promise<ScoredRun> {
  const scored = scoreRecord(record);
  // sig_hash is computed over the materialized EvidenceRecord, not the
  // raw source stream: ScoredRun traces to the evidence row.
  const sig_hash = await canonicalSha256(record);
  const provenance = {
    source_file: source_file_relpath,
    line_offset,
    sig_hash,
    recorded_at,
  };
  return {
    schema_version: SCORED_RUN_SCHEMA_VERSION,
    evidence_run_id: record.run_id,
    evidence_task_id: record.task_id,
    category: scored.category,
    reasons: scored.reasons,
    scored_at: recorded_at,
    scorer_version: SCORER_VERSION,
    sub_scores: scored.sub_scores,
    provenance,
  };
}