// scorer.ts — pure deterministic Success Scorer. // // Takes one EvidenceRecord, returns category + reasons + sub_scores. // NO I/O, NO LLM, NO clock reads, NO mutable state. The only randomness // allowed is none. Identical input → identical output forever. // // Three-class strategy (see docs/recon/local-distillation-recon.md + // data/_kb/evidence_health.md for the source taxonomy): // // CLASS A — verdict-bearing // scrum_reviews, observer_reviews, audits, contract_analyses // Direct scoring from existing markers/observer_verdict // // CLASS B — telemetry-rich // auto_apply, outcomes, mode_experiments // Markers exist but partial; needs_human_review fills the gap // // CLASS C — pure-extraction (no native scoring signal) // distilled_*, audit_facts, observer_escalations // Default needs_human_review; v2 will JOIN to parent verdict // // scorer_version is stamped on every output. Bumping it lets a // downstream re-scoring detect drift between historical runs. import type { EvidenceRecord } from "../../auditor/schemas/distillation/evidence_record"; import type { ScoreCategory, ScoredRun } from "../../auditor/schemas/distillation/scored_run"; import { SCORED_RUN_SCHEMA_VERSION } from "../../auditor/schemas/distillation/scored_run"; import { canonicalSha256 } from "../../auditor/schemas/distillation/types"; export const SCORER_VERSION = process.env.LH_SCORER_VERSION ?? "v1.0.0"; export interface ScoreOutput { category: ScoreCategory; reasons: string[]; sub_scores: ScoredRun["sub_scores"]; } // Map source_file (from provenance) → source class. Centralized so // adding a new source is one-line. type SourceClass = "verdict" | "telemetry" | "extraction"; function sourceClassFor(source_file: string): SourceClass { // Strip data/_kb/ prefix and .jsonl suffix to compare by stem const stem = source_file.replace(/^data\/_kb\//, "").replace(/\.jsonl$/, ""); switch (stem) { case "scrum_reviews": case "observer_reviews": case "audits": case "contract_analyses": return "verdict"; case "auto_apply": case "outcomes": case "mode_experiments": return "telemetry"; case "distilled_facts": case "distilled_procedures": case "distilled_config_hints": case "audit_facts": case "observer_escalations": return "extraction"; default: // Unknown source — route to extraction (most conservative — // forces needs_human_review until a transform is added). return "extraction"; } } // ─── Class A: verdict-bearing ───────────────────────────────────── function scoreScrumReview(r: EvidenceRecord): ScoreOutput { const reasons: string[] = []; const subs: ScoredRun["sub_scores"] = {}; const successMarker = (r.success_markers ?? []).find(m => m.startsWith("accepted_on_attempt_")); if (!successMarker) { reasons.push("scrum_review missing accepted_on_attempt_* success marker"); return { category: "needs_human_review", reasons, sub_scores: subs }; } const attempt = Number(successMarker.replace("accepted_on_attempt_", "")); subs.accepted_on_attempt = attempt; if (attempt === 1) { reasons.push("scrum: accepted on first attempt"); return { category: "accepted", reasons, sub_scores: subs }; } if (attempt <= 3) { reasons.push(`scrum: accepted after ${attempt} attempts`); return { category: "partially_accepted", reasons, sub_scores: subs }; } reasons.push(`scrum: accepted only after ${attempt} attempts (high-cost path)`); return { category: "partially_accepted", reasons, sub_scores: subs }; } function scoreObserverReview(r: EvidenceRecord): ScoreOutput { const reasons: string[] = []; const subs: ScoredRun["sub_scores"] = {}; const v = r.observer_verdict; if (v === "accept") { subs.observer_verdict = "accept"; reasons.push("observer accepted the reviewed attempt"); return { category: "accepted", reasons, sub_scores: subs }; } if (v === "reject") { subs.observer_verdict = "reject"; reasons.push("observer rejected the reviewed attempt"); return { category: "rejected", reasons, sub_scores: subs }; } if (v === "cycle") { subs.observer_verdict = "cycle"; reasons.push("observer flagged the attempt as cycling — partial signal"); return { category: "partially_accepted", reasons, sub_scores: subs }; } reasons.push(`observer_verdict missing or unrecognized: ${JSON.stringify(v ?? null)}`); return { category: "needs_human_review", reasons, sub_scores: subs }; } function scoreAudit(r: EvidenceRecord): ScoreOutput { // audits.jsonl is the auditor's per-finding stream (not PR verdicts). // Phase 2 transform encodes severity into markers: // audit_severity_{info,low} → accepted (minor finding) // audit_severity_medium → partially_accepted // audit_severity_{high,critical} → rejected (real problem) // Older "approved"/"blocked"/"request_changes" markers also handled // for back-compat with any pre-fix materializations on disk. const reasons: string[] = []; const subs: ScoredRun["sub_scores"] = {}; const succ = r.success_markers ?? []; const fail = r.failure_markers ?? []; if (succ.includes("approved")) { reasons.push("audit overall=approved (legacy marker)"); return { category: "accepted", reasons, sub_scores: subs }; } if (fail.includes("blocked")) { reasons.push("audit overall=block (legacy marker)"); return { category: "rejected", reasons, sub_scores: subs }; } if (fail.includes("request_changes")) { reasons.push("audit overall=request_changes (legacy marker)"); return { category: "partially_accepted", reasons, sub_scores: subs }; } // Severity-derived markers (current Phase 2 transform): const sevSucc = succ.find(m => m.startsWith("audit_severity_")); const sevFail = fail.find(m => m.startsWith("audit_severity_")); if (sevSucc) { reasons.push(`${sevSucc} → minor finding`); return { category: "accepted", reasons, sub_scores: subs }; } if (sevFail === "audit_severity_medium") { reasons.push("audit_severity_medium → finding warrants review"); return { category: "partially_accepted", reasons, sub_scores: subs }; } if (sevFail === "audit_severity_high" || sevFail === "audit_severity_critical") { reasons.push(`${sevFail} → blocking finding`); return { category: "rejected", reasons, sub_scores: subs }; } reasons.push("audit row has no severity or overall marker"); return { category: "needs_human_review", reasons, sub_scores: subs }; } function scoreContractAnalysis(r: EvidenceRecord): ScoreOutput { const reasons: string[] = []; const subs: ScoredRun["sub_scores"] = {}; const v = r.observer_verdict; // failure_markers takes precedence: explicit rejection beats absent verdict if ((r.failure_markers ?? []).includes("observer_rejected") || v === "reject") { subs.observer_verdict = "reject"; reasons.push("contract analysis: observer rejected"); return { category: "rejected", reasons, sub_scores: subs }; } if (v === "accept") { subs.observer_verdict = "accept"; reasons.push("contract analysis: observer accepted"); return { category: "accepted", reasons, sub_scores: subs }; } if (v === "cycle") { subs.observer_verdict = "cycle"; reasons.push("contract analysis: observer cycled (partial)"); return { category: "partially_accepted", reasons, sub_scores: subs }; } reasons.push("contract analysis: no observer verdict signal"); return { category: "needs_human_review", reasons, sub_scores: subs }; } // ─── Class B: telemetry-rich ────────────────────────────────────── function scoreAutoApply(r: EvidenceRecord): ScoreOutput { const reasons: string[] = []; const subs: ScoredRun["sub_scores"] = {}; if ((r.success_markers ?? []).includes("committed")) { subs.cargo_green = true; reasons.push("auto_apply: patch committed (cargo green + warning baseline + rationale alignment passed)"); return { category: "accepted", reasons, sub_scores: subs }; } const failures = (r.failure_markers ?? []); const reverted = failures.find(f => f.includes("reverted")); if (reverted) { if (reverted.includes("build_red")) subs.cargo_green = false; reasons.push(`auto_apply: ${reverted}`); return { category: "rejected", reasons, sub_scores: subs }; } // no_patches / dry_run / all_rejected — not a failure of code, but no commit either reasons.push("auto_apply: no commit + no revert (likely no_patches or dry_run)"); return { category: "needs_human_review", reasons, sub_scores: subs }; } function scoreOutcomes(r: EvidenceRecord): ScoreOutput { const reasons: string[] = []; const subs: ScoredRun["sub_scores"] = {}; if ((r.success_markers ?? []).includes("all_events_ok")) { reasons.push("outcomes: all events ok"); return { category: "accepted", reasons, sub_scores: subs }; } // Validation results may carry partial signal const gap = r.validation_results?.gap_signals as number | undefined; if (typeof gap === "number" && gap > 0) { reasons.push(`outcomes: ${gap} gap signal(s) detected`); return { category: "partially_accepted", reasons, sub_scores: subs }; } reasons.push("outcomes: no decisive marker — defer to human"); return { category: "needs_human_review", reasons, sub_scores: subs }; } function scoreModeExperiment(r: EvidenceRecord): ScoreOutput { const reasons: string[] = []; const subs: ScoredRun["sub_scores"] = {}; // mode_experiments at Phase 2 lacks markers (transform doesn't derive // them yet). v1 derivation: a non-empty response with reasonable // latency is at least partially_accepted; otherwise needs_human_review. // Anything stronger needs the grounding-from-mode_compare hook in // Phase 4 / re-scoring. if (typeof r.text !== "string" || r.text.trim().length === 0) { reasons.push("mode_experiment: empty response text"); return { category: "rejected", reasons, sub_scores: subs }; } if (typeof r.latency_ms === "number" && r.latency_ms > 120_000) { reasons.push(`mode_experiment: latency ${r.latency_ms}ms exceeds 2-minute soft cap`); return { category: "partially_accepted", reasons, sub_scores: subs }; } reasons.push("mode_experiment: response present, latency within bounds; verdict not yet wired"); return { category: "needs_human_review", reasons, sub_scores: subs }; } // ─── Class C: pure-extraction ──────────────────────────────────── function scoreExtraction(r: EvidenceRecord): ScoreOutput { // Phase 3 v1: extraction-class records have no native scoring // signal. Default to needs_human_review with an explicit reason. // Phase 3 v2 will JOIN to a parent verdict-bearing record. const reasons = ["extraction-class source has no native scoring signal — JOIN to parent verdict pending Phase 3 v2"]; return { category: "needs_human_review", reasons, sub_scores: {} }; } // ─── Dispatch ───────────────────────────────────────────────────── export function scoreRecord(record: EvidenceRecord): ScoreOutput { const cls = sourceClassFor(record.provenance.source_file); const stem = record.provenance.source_file.replace(/^data\/_kb\//, "").replace(/\.jsonl$/, ""); if (cls === "verdict") { if (stem === "scrum_reviews") return scoreScrumReview(record); if (stem === "observer_reviews") return scoreObserverReview(record); if (stem === "audits") return scoreAudit(record); if (stem === "contract_analyses") return scoreContractAnalysis(record); } if (cls === "telemetry") { if (stem === "auto_apply") return scoreAutoApply(record); if (stem === "outcomes") return scoreOutcomes(record); if (stem === "mode_experiments") return scoreModeExperiment(record); } return scoreExtraction(record); } // Build a complete ScoredRun. Caller supplies recorded_at + the // source file / line offset to populate provenance. export async function buildScoredRun( record: EvidenceRecord, source_file_relpath: string, line_offset: number, recorded_at: string, ): Promise { const out = scoreRecord(record); // Compute provenance.sig_hash over the EvidenceRecord (not raw source); // ScoredRun traces to the materialized evidence row, not the raw stream. const sig_hash = await canonicalSha256(record); return { schema_version: SCORED_RUN_SCHEMA_VERSION, evidence_run_id: record.run_id, evidence_task_id: record.task_id, category: out.category, reasons: out.reasons, scored_at: recorded_at, scorer_version: SCORER_VERSION, sub_scores: out.sub_scores, provenance: { source_file: source_file_relpath, line_offset, sig_hash, recorded_at, }, }; }