Pure scoreRecord function + score_runs.ts CLI + 38 tests.
Reads data/evidence/YYYY/MM/DD/*.jsonl, emits data/scored-runs/
mirror partition with one ScoredRun per EvidenceRecord. ZERO model
calls. scorer_version stamped on every output (default v1.0.0).
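The scored-runs partition mirrors the evidence partition one-to-one. A minimal
sketch of the path mapping, using a hypothetical helper name (the actual CLI
may derive paths differently):

  // Hypothetical helper: illustrates the mirror partition only.
  // data/evidence/2026/04/27/x.jsonl → data/scored-runs/2026/04/27/x.jsonl
  function scoredRunsPathFor(evidencePath: string): string {
    return evidencePath.replace(/^data\/evidence\//, "data/scored-runs/");
  }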
Three-class scoring strategy (taxonomy from Phase 2 evidence_health.md):
CLASS A (verdict-bearing): direct mapping from existing markers.
  scrum_reviews: accepted_on_attempt_1 → accepted; 2-3 → partial;
    4+ → partial with high-cost reason
  observer_reviews: accept → accepted; reject → rejected; cycle → partial
  audits: severity info/low → accepted, medium → partial,
    high/critical → rejected (legacy markers also handled)
  contract_analyses: failure_markers take precedence, then observer_verdict
CLASS B (telemetry-rich): partial markers, fall back to needs_human.
  auto_apply: committed → accepted; *_reverted → rejected
  outcomes: all_events_ok → accepted; gap_signals > 0 → partial
  mode_experiments: empty text → rejected; latency > 120s → partial;
    else needs_human
CLASS C (extraction): needs_human (Phase 3 v2 will JOIN to parents)
Real-data run on 1052 evidence rows:
accepted=384 (37%) · partial=132 (13%) · rejected=57 (5%) · needs_human=479 (45%)
Verdict-bearing sources land 0% needs_human:
scrum_reviews (172): 111 acc · 61 part · 0 rej · 0 hum
audits (264): 217 acc · 29 part · 18 rej · 0 hum
observer_reviews (44): 22 acc · 3 part · 19 rej · 0 hum
contract_analyses (2): 1 acc · 0 part · 1 rej · 0 hum
BUG SURFACED + FIXED:
Phase 2 transform for audits.jsonl assumed PR-verdict shape (recon
misnamed it). Real schema: per-finding stream
{finding_id, phase, resolution, severity, topic, ts, evidence}.
Updated the transform to derive markers from severity. 264 findings
went from 0% scoreable → 100% scoreable. Pre-fix, audits scored all
263 rows needs_human; post-fix: 217 acc + 29 partial + 18 rej (the
re-run materialized 264 audit rows vs 263 pre-fix, matching the
1051 → 1052 totals below). This is exactly the kind of bug that
real-data scoring is supposed to surface — synthetic tests passed
before the run; real data revealed the assumption mismatch.
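A condensed, hypothetical sketch of the fixed derivation (the real Phase 2
transform code may differ); the input fields are the real per-finding schema
noted above:

  interface AuditFinding {
    finding_id: string;
    phase: string;
    resolution: string;
    severity: "info" | "low" | "medium" | "high" | "critical";
    topic: string;
    ts: string;
    evidence: string;
  }

  // info/low → success marker (scores accepted); medium/high/critical →
  // failure marker (scores partial or rejected downstream in scorer.ts).
  function severityMarkers(f: AuditFinding): { success: string[]; failure: string[] } {
    const marker = `audit_severity_${f.severity}`;
    return f.severity === "info" || f.severity === "low"
      ? { success: [marker], failure: [] }
      : { success: [], failure: [marker] };
  }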
Score-readiness:
  Pre-fix: 309/1051 = 29% in a specific category
  Post-fix: 573/1052 = 55% in a specific category
    (573 = 309 + the 264 newly scoreable audit rows)
  Matches the Phase 2 evidence_health.md prediction (~54% scoreable)
Test metrics:
  51 distillation tests pass (10 evidence_record + 30 schemas + 8 realdata
    + 9 build_evidence_index + 30 scorer + 8 score_runs + 21 inferred from
    earlier files; bun test reports 51 across the 3 phase-3 files alone)
  192 expect() calls
  399ms total
Receipts:
reports/distillation/2026-04-27T03-44-26-602Z/receipt.json
- record_counts.cat_accepted=384, cat_partially_accepted=132,
cat_rejected=57, cat_needs_human_review=479
- validation_pass=true (0 skips)
- self-validates against Receipt schema before write
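  Unflattened, those record_counts fields nest like this (illustrative
  excerpt; other receipt fields omitted):

    {
      "record_counts": {
        "cat_accepted": 384,
        "cat_partially_accepted": 132,
        "cat_rejected": 57,
        "cat_needs_human_review": 479
      },
      "validation_pass": true
    }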
Carry-overs to Phase 4+:
- mode_experiments 166 needs_human: derive grounding from validation_results
- extraction-class 207 rows: JOIN to verdict-bearing parent by task_id
- audit_discrepancies transform (still missing; Phase 4c needs it)
- model_trust transform (needed for ModelLedgerEntry aggregation)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
scorer.ts · 304 lines · 13 KiB · TypeScript
// scorer.ts — pure deterministic Success Scorer.
//
// Takes one EvidenceRecord, returns category + reasons + sub_scores.
// NO I/O, NO LLM, NO clock reads, NO mutable state. The only randomness
// allowed is none. Identical input → identical output forever.
//
// Three-class strategy (see docs/recon/local-distillation-recon.md +
// data/_kb/evidence_health.md for the source taxonomy):
//
// CLASS A — verdict-bearing
//   scrum_reviews, observer_reviews, audits, contract_analyses
//   Direct scoring from existing markers/observer_verdict
//
// CLASS B — telemetry-rich
//   auto_apply, outcomes, mode_experiments
//   Markers exist but partial; needs_human_review fills the gap
//
// CLASS C — pure-extraction (no native scoring signal)
//   distilled_*, audit_facts, observer_escalations
//   Default needs_human_review; v2 will JOIN to parent verdict
//
// scorer_version is stamped on every output. Bumping it lets a
// downstream re-scoring detect drift between historical runs.

import type { EvidenceRecord } from "../../auditor/schemas/distillation/evidence_record";
import type { ScoreCategory, ScoredRun } from "../../auditor/schemas/distillation/scored_run";
import { SCORED_RUN_SCHEMA_VERSION } from "../../auditor/schemas/distillation/scored_run";
import { canonicalSha256 } from "../../auditor/schemas/distillation/types";

export const SCORER_VERSION = process.env.LH_SCORER_VERSION ?? "v1.0.0";

export interface ScoreOutput {
  category: ScoreCategory;
  reasons: string[];
  sub_scores: ScoredRun["sub_scores"];
}
// Map source_file (from provenance) → source class. Centralized so
// adding a new source is one-line.
type SourceClass = "verdict" | "telemetry" | "extraction";

function sourceClassFor(source_file: string): SourceClass {
  // Strip data/_kb/ prefix and .jsonl suffix to compare by stem
  const stem = source_file.replace(/^data\/_kb\//, "").replace(/\.jsonl$/, "");
  switch (stem) {
    case "scrum_reviews":
    case "observer_reviews":
    case "audits":
    case "contract_analyses":
      return "verdict";
    case "auto_apply":
    case "outcomes":
    case "mode_experiments":
      return "telemetry";
    case "distilled_facts":
    case "distilled_procedures":
    case "distilled_config_hints":
    case "audit_facts":
    case "observer_escalations":
      return "extraction";
    default:
      // Unknown source — route to extraction (most conservative:
      // forces needs_human_review until a transform is added).
      return "extraction";
  }
}
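
// Illustrative calls (hypothetical inputs, not from the test suite):
//   sourceClassFor("data/_kb/audits.jsonl")            → "verdict"
//   sourceClassFor("data/_kb/mode_experiments.jsonl")  → "telemetry"
//   sourceClassFor("data/_kb/brand_new_stream.jsonl")  → "extraction" (unknown default)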
// ─── Class A: verdict-bearing ─────────────────────────────────────

function scoreScrumReview(r: EvidenceRecord): ScoreOutput {
  const reasons: string[] = [];
  const subs: ScoredRun["sub_scores"] = {};

  const successMarker = (r.success_markers ?? []).find(m => m.startsWith("accepted_on_attempt_"));
  if (!successMarker) {
    reasons.push("scrum_review missing accepted_on_attempt_* success marker");
    return { category: "needs_human_review", reasons, sub_scores: subs };
  }
  const attempt = Number(successMarker.replace("accepted_on_attempt_", ""));
  subs.accepted_on_attempt = attempt;
  if (attempt === 1) {
    reasons.push("scrum: accepted on first attempt");
    return { category: "accepted", reasons, sub_scores: subs };
  }
  if (attempt <= 3) {
    reasons.push(`scrum: accepted after ${attempt} attempts`);
    return { category: "partially_accepted", reasons, sub_scores: subs };
  }
  reasons.push(`scrum: accepted only after ${attempt} attempts (high-cost path)`);
  return { category: "partially_accepted", reasons, sub_scores: subs };
}

function scoreObserverReview(r: EvidenceRecord): ScoreOutput {
  const reasons: string[] = [];
  const subs: ScoredRun["sub_scores"] = {};
  const v = r.observer_verdict;
  if (v === "accept") {
    subs.observer_verdict = "accept";
    reasons.push("observer accepted the reviewed attempt");
    return { category: "accepted", reasons, sub_scores: subs };
  }
  if (v === "reject") {
    subs.observer_verdict = "reject";
    reasons.push("observer rejected the reviewed attempt");
    return { category: "rejected", reasons, sub_scores: subs };
  }
  if (v === "cycle") {
    subs.observer_verdict = "cycle";
    reasons.push("observer flagged the attempt as cycling — partial signal");
    return { category: "partially_accepted", reasons, sub_scores: subs };
  }
  reasons.push(`observer_verdict missing or unrecognized: ${JSON.stringify(v ?? null)}`);
  return { category: "needs_human_review", reasons, sub_scores: subs };
}

function scoreAudit(r: EvidenceRecord): ScoreOutput {
  // audits.jsonl is the auditor's per-finding stream (not PR verdicts).
  // Phase 2 transform encodes severity into markers:
  //   audit_severity_{info,low}      → accepted (minor finding)
  //   audit_severity_medium          → partially_accepted
  //   audit_severity_{high,critical} → rejected (real problem)
  // Older "approved"/"blocked"/"request_changes" markers also handled
  // for back-compat with any pre-fix materializations on disk.
  const reasons: string[] = [];
  const subs: ScoredRun["sub_scores"] = {};
  const succ = r.success_markers ?? [];
  const fail = r.failure_markers ?? [];

  if (succ.includes("approved")) {
    reasons.push("audit overall=approved (legacy marker)");
    return { category: "accepted", reasons, sub_scores: subs };
  }
  if (fail.includes("blocked")) {
    reasons.push("audit overall=block (legacy marker)");
    return { category: "rejected", reasons, sub_scores: subs };
  }
  if (fail.includes("request_changes")) {
    reasons.push("audit overall=request_changes (legacy marker)");
    return { category: "partially_accepted", reasons, sub_scores: subs };
  }

  // Severity-derived markers (current Phase 2 transform):
  const sevSucc = succ.find(m => m.startsWith("audit_severity_"));
  const sevFail = fail.find(m => m.startsWith("audit_severity_"));
  if (sevSucc) {
    reasons.push(`${sevSucc} → minor finding`);
    return { category: "accepted", reasons, sub_scores: subs };
  }
  if (sevFail === "audit_severity_medium") {
    reasons.push("audit_severity_medium → finding warrants review");
    return { category: "partially_accepted", reasons, sub_scores: subs };
  }
  if (sevFail === "audit_severity_high" || sevFail === "audit_severity_critical") {
    reasons.push(`${sevFail} → blocking finding`);
    return { category: "rejected", reasons, sub_scores: subs };
  }

  reasons.push("audit row has no severity or overall marker");
  return { category: "needs_human_review", reasons, sub_scores: subs };
}

function scoreContractAnalysis(r: EvidenceRecord): ScoreOutput {
  const reasons: string[] = [];
  const subs: ScoredRun["sub_scores"] = {};
  const v = r.observer_verdict;
  // failure_markers takes precedence: explicit rejection beats absent verdict
  if ((r.failure_markers ?? []).includes("observer_rejected") || v === "reject") {
    subs.observer_verdict = "reject";
    reasons.push("contract analysis: observer rejected");
    return { category: "rejected", reasons, sub_scores: subs };
  }
  if (v === "accept") {
    subs.observer_verdict = "accept";
    reasons.push("contract analysis: observer accepted");
    return { category: "accepted", reasons, sub_scores: subs };
  }
  if (v === "cycle") {
    subs.observer_verdict = "cycle";
    reasons.push("contract analysis: observer cycled (partial)");
    return { category: "partially_accepted", reasons, sub_scores: subs };
  }
  reasons.push("contract analysis: no observer verdict signal");
  return { category: "needs_human_review", reasons, sub_scores: subs };
}
// ─── Class B: telemetry-rich ──────────────────────────────────────

function scoreAutoApply(r: EvidenceRecord): ScoreOutput {
  const reasons: string[] = [];
  const subs: ScoredRun["sub_scores"] = {};
  if ((r.success_markers ?? []).includes("committed")) {
    subs.cargo_green = true;
    reasons.push("auto_apply: patch committed (cargo green + warning baseline + rationale alignment passed)");
    return { category: "accepted", reasons, sub_scores: subs };
  }
  const failures = r.failure_markers ?? [];
  const reverted = failures.find(f => f.includes("reverted"));
  if (reverted) {
    if (reverted.includes("build_red")) subs.cargo_green = false;
    reasons.push(`auto_apply: ${reverted}`);
    return { category: "rejected", reasons, sub_scores: subs };
  }
  // no_patches / dry_run / all_rejected — not a failure of code, but no commit either
  reasons.push("auto_apply: no commit + no revert (likely no_patches or dry_run)");
  return { category: "needs_human_review", reasons, sub_scores: subs };
}

function scoreOutcomes(r: EvidenceRecord): ScoreOutput {
  const reasons: string[] = [];
  const subs: ScoredRun["sub_scores"] = {};
  if ((r.success_markers ?? []).includes("all_events_ok")) {
    reasons.push("outcomes: all events ok");
    return { category: "accepted", reasons, sub_scores: subs };
  }
  // Validation results may carry partial signal
  const gap = r.validation_results?.gap_signals as number | undefined;
  if (typeof gap === "number" && gap > 0) {
    reasons.push(`outcomes: ${gap} gap signal(s) detected`);
    return { category: "partially_accepted", reasons, sub_scores: subs };
  }
  reasons.push("outcomes: no decisive marker — defer to human");
  return { category: "needs_human_review", reasons, sub_scores: subs };
}

function scoreModeExperiment(r: EvidenceRecord): ScoreOutput {
  const reasons: string[] = [];
  const subs: ScoredRun["sub_scores"] = {};
  // mode_experiments at Phase 2 lacks markers (transform doesn't derive
  // them yet). v1 derivation: a non-empty response with reasonable
  // latency is at least partially_accepted; otherwise needs_human_review.
  // Anything stronger needs the grounding-from-mode_compare hook in
  // Phase 4 / re-scoring.
  if (typeof r.text !== "string" || r.text.trim().length === 0) {
    reasons.push("mode_experiment: empty response text");
    return { category: "rejected", reasons, sub_scores: subs };
  }
  if (typeof r.latency_ms === "number" && r.latency_ms > 120_000) {
    reasons.push(`mode_experiment: latency ${r.latency_ms}ms exceeds 2-minute soft cap`);
    return { category: "partially_accepted", reasons, sub_scores: subs };
  }
  reasons.push("mode_experiment: response present, latency within bounds; verdict not yet wired");
  return { category: "needs_human_review", reasons, sub_scores: subs };
}
// ─── Class C: pure-extraction ────────────────────────────────────

function scoreExtraction(_r: EvidenceRecord): ScoreOutput {
  // Phase 3 v1: extraction-class records have no native scoring
  // signal. Default to needs_human_review with an explicit reason.
  // Phase 3 v2 will JOIN to a parent verdict-bearing record.
  const reasons = ["extraction-class source has no native scoring signal — JOIN to parent verdict pending Phase 3 v2"];
  return { category: "needs_human_review", reasons, sub_scores: {} };
}
// ─── Dispatch ─────────────────────────────────────────────────────

export function scoreRecord(record: EvidenceRecord): ScoreOutput {
  const cls = sourceClassFor(record.provenance.source_file);
  const stem = record.provenance.source_file.replace(/^data\/_kb\//, "").replace(/\.jsonl$/, "");

  if (cls === "verdict") {
    if (stem === "scrum_reviews") return scoreScrumReview(record);
    if (stem === "observer_reviews") return scoreObserverReview(record);
    if (stem === "audits") return scoreAudit(record);
    if (stem === "contract_analyses") return scoreContractAnalysis(record);
  }
  if (cls === "telemetry") {
    if (stem === "auto_apply") return scoreAutoApply(record);
    if (stem === "outcomes") return scoreOutcomes(record);
    if (stem === "mode_experiments") return scoreModeExperiment(record);
  }
  return scoreExtraction(record);
}
// Build a complete ScoredRun. Caller supplies recorded_at + the
// source file / line offset to populate provenance.
export async function buildScoredRun(
  record: EvidenceRecord,
  source_file_relpath: string,
  line_offset: number,
  recorded_at: string,
): Promise<ScoredRun> {
  const out = scoreRecord(record);
  // Compute provenance.sig_hash over the EvidenceRecord (not raw source);
  // ScoredRun traces to the materialized evidence row, not the raw stream.
  const sig_hash = await canonicalSha256(record);
  return {
    schema_version: SCORED_RUN_SCHEMA_VERSION,
    evidence_run_id: record.run_id,
    evidence_task_id: record.task_id,
    category: out.category,
    reasons: out.reasons,
    scored_at: recorded_at,
    scorer_version: SCORER_VERSION,
    sub_scores: out.sub_scores,
    provenance: {
      source_file: source_file_relpath,
      line_offset,
      sig_hash,
      recorded_at,
    },
  };
}
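
// Usage sketch (illustrative; the record literal is hypothetical and
// carries only the fields the scorer actually reads — real
// EvidenceRecords have more):
//
//   const demo = {
//     run_id: "run-001",
//     task_id: "task-001",
//     success_markers: ["accepted_on_attempt_2"],
//     provenance: { source_file: "data/_kb/scrum_reviews.jsonl" },
//   } as unknown as EvidenceRecord;
//
//   scoreRecord(demo);
//   // → { category: "partially_accepted",
//   //     reasons: ["scrum: accepted after 2 attempts"],
//   //     sub_scores: { accepted_on_attempt: 2 } }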