Kimi For Coding (api.kimi.com, kimi-for-coding) ran a forensic audit on
distillation v1.0.0 with full file content. 7/7 flags verified real on
grep. Substrate now matches what v1.0.0 claimed: deterministic, no
schema bypasses, Rust tests compile.
Fixes:
- mode.rs:1035,1042 matrix_corpus Some/None -> vec![..]/vec![]; cargo
check --tests now compiles (was silently broken;
only bun tests were running)
- scorer.ts:30 SCORER_VERSION env override removed - identical
input now produces identical version stamp, not
env-dependent drift
- transforms.ts:181 auto_apply wall-clock fallback (new Date()) ->
deterministic recorded_at fallback
- replay.ts:378 recorded_run_id Date.now() -> sha256(recorded_at);
replay rows now reproducible given recorded_at
- receipts.ts:454,495 input_hash_match hardcoded true was misleading
telemetry; bumped DRIFT_REPORT_SCHEMA_VERSION 1->2,
field is now boolean|null with honest null when
not computed at this layer
- score_runs.ts:89-100,159 dedup keyed only on sig_hash made
scorer-version bumps invisible. Composite
sig_hash:scorer_version forces re-scoring
- export_sft.ts:126 (ev as any).contractor bypass emitted "<contractor>"
placeholder for every contract_analyses SFT row.
Added typed EvidenceRecord.metadata bucket;
transforms.ts populates metadata.contractor;
exporter reads typed value
Verification (all green):
cargo check -p gateway --tests compiles
bun test tests/distillation/ 145 pass / 0 fail
bun acceptance 22/22 invariants
bun audit-full 16/16 required checks
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
308 lines
13 KiB
TypeScript
// scorer.ts — pure deterministic Success Scorer.
|
|
//
|
|
// Takes one EvidenceRecord, returns category + reasons + sub_scores.
|
|
// NO I/O, NO LLM, NO clock reads, NO mutable state. The only randomness
|
|
// allowed is none. Identical input → identical output forever.
|
|
//
|
|
// Three-class strategy (see docs/recon/local-distillation-recon.md +
|
|
// data/_kb/evidence_health.md for the source taxonomy):
|
|
//
|
|
// CLASS A — verdict-bearing
|
|
// scrum_reviews, observer_reviews, audits, contract_analyses
|
|
// Direct scoring from existing markers/observer_verdict
|
|
//
|
|
// CLASS B — telemetry-rich
|
|
// auto_apply, outcomes, mode_experiments
|
|
// Markers exist but partial; needs_human_review fills the gap
|
|
//
|
|
// CLASS C — pure-extraction (no native scoring signal)
|
|
// distilled_*, audit_facts, observer_escalations
|
|
// Default needs_human_review; v2 will JOIN to parent verdict
|
|
//
|
|
// scorer_version is stamped on every output. Bumping it lets a
|
|
// downstream re-scoring detect drift between historical runs.
|
|
|
|
import type { EvidenceRecord } from "../../auditor/schemas/distillation/evidence_record";
|
|
import type { ScoreCategory, ScoredRun } from "../../auditor/schemas/distillation/scored_run";
|
|
import { SCORED_RUN_SCHEMA_VERSION } from "../../auditor/schemas/distillation/scored_run";
|
|
import { canonicalSha256 } from "../../auditor/schemas/distillation/types";
|
|
|
|
// Hardcoded — the deterministic-output contract requires this. Bump the
// literal in the same commit as any scoring-rule change so the version
// stamp moves atomically with logic. Env override removed 2026-04-27
// after Kimi audit flagged identical-input-different-version drift.
// Stamped onto every ScoredRun by buildScoredRun below so downstream
// re-scoring can detect rule drift between historical runs.
export const SCORER_VERSION = "v1.0.0";
|
|
|
|
/**
 * The scorer's verdict for one EvidenceRecord: a category, the
 * human-readable reasons that justify it, and any structured
 * sub-scores extracted along the way. Consumed by buildScoredRun.
 */
export interface ScoreOutput {
  // One of the categories emitted in this file: "accepted",
  // "partially_accepted", "rejected", or "needs_human_review".
  category: ScoreCategory;
  // Ordered justifications; every return path pushes at least one.
  reasons: string[];
  // Structured signals (attempt counts, observer verdicts, build
  // flags) copied verbatim into the ScoredRun.
  sub_scores: ScoredRun["sub_scores"];
}
|
|
|
|
// Map source_file (from provenance) → source class. Centralized so
|
|
// adding a new source is one-line.
|
|
type SourceClass = "verdict" | "telemetry" | "extraction";
|
|
|
|
function sourceClassFor(source_file: string): SourceClass {
|
|
// Strip data/_kb/ prefix and .jsonl suffix to compare by stem
|
|
const stem = source_file.replace(/^data\/_kb\//, "").replace(/\.jsonl$/, "");
|
|
switch (stem) {
|
|
case "scrum_reviews":
|
|
case "observer_reviews":
|
|
case "audits":
|
|
case "contract_analyses":
|
|
return "verdict";
|
|
case "auto_apply":
|
|
case "outcomes":
|
|
case "mode_experiments":
|
|
return "telemetry";
|
|
case "distilled_facts":
|
|
case "distilled_procedures":
|
|
case "distilled_config_hints":
|
|
case "audit_facts":
|
|
case "observer_escalations":
|
|
return "extraction";
|
|
default:
|
|
// Unknown source — route to extraction (most conservative —
|
|
// forces needs_human_review until a transform is added).
|
|
return "extraction";
|
|
}
|
|
}
|
|
|
|
// ─── Class A: verdict-bearing ─────────────────────────────────────
|
|
|
|
function scoreScrumReview(r: EvidenceRecord): ScoreOutput {
|
|
const reasons: string[] = [];
|
|
const subs: ScoredRun["sub_scores"] = {};
|
|
|
|
const successMarker = (r.success_markers ?? []).find(m => m.startsWith("accepted_on_attempt_"));
|
|
if (!successMarker) {
|
|
reasons.push("scrum_review missing accepted_on_attempt_* success marker");
|
|
return { category: "needs_human_review", reasons, sub_scores: subs };
|
|
}
|
|
const attempt = Number(successMarker.replace("accepted_on_attempt_", ""));
|
|
subs.accepted_on_attempt = attempt;
|
|
if (attempt === 1) {
|
|
reasons.push("scrum: accepted on first attempt");
|
|
return { category: "accepted", reasons, sub_scores: subs };
|
|
}
|
|
if (attempt <= 3) {
|
|
reasons.push(`scrum: accepted after ${attempt} attempts`);
|
|
return { category: "partially_accepted", reasons, sub_scores: subs };
|
|
}
|
|
reasons.push(`scrum: accepted only after ${attempt} attempts (high-cost path)`);
|
|
return { category: "partially_accepted", reasons, sub_scores: subs };
|
|
}
|
|
|
|
function scoreObserverReview(r: EvidenceRecord): ScoreOutput {
|
|
const reasons: string[] = [];
|
|
const subs: ScoredRun["sub_scores"] = {};
|
|
const v = r.observer_verdict;
|
|
if (v === "accept") {
|
|
subs.observer_verdict = "accept";
|
|
reasons.push("observer accepted the reviewed attempt");
|
|
return { category: "accepted", reasons, sub_scores: subs };
|
|
}
|
|
if (v === "reject") {
|
|
subs.observer_verdict = "reject";
|
|
reasons.push("observer rejected the reviewed attempt");
|
|
return { category: "rejected", reasons, sub_scores: subs };
|
|
}
|
|
if (v === "cycle") {
|
|
subs.observer_verdict = "cycle";
|
|
reasons.push("observer flagged the attempt as cycling — partial signal");
|
|
return { category: "partially_accepted", reasons, sub_scores: subs };
|
|
}
|
|
reasons.push(`observer_verdict missing or unrecognized: ${JSON.stringify(v ?? null)}`);
|
|
return { category: "needs_human_review", reasons, sub_scores: subs };
|
|
}
|
|
|
|
function scoreAudit(r: EvidenceRecord): ScoreOutput {
|
|
// audits.jsonl is the auditor's per-finding stream (not PR verdicts).
|
|
// Phase 2 transform encodes severity into markers:
|
|
// audit_severity_{info,low} → accepted (minor finding)
|
|
// audit_severity_medium → partially_accepted
|
|
// audit_severity_{high,critical} → rejected (real problem)
|
|
// Older "approved"/"blocked"/"request_changes" markers also handled
|
|
// for back-compat with any pre-fix materializations on disk.
|
|
const reasons: string[] = [];
|
|
const subs: ScoredRun["sub_scores"] = {};
|
|
const succ = r.success_markers ?? [];
|
|
const fail = r.failure_markers ?? [];
|
|
|
|
if (succ.includes("approved")) {
|
|
reasons.push("audit overall=approved (legacy marker)");
|
|
return { category: "accepted", reasons, sub_scores: subs };
|
|
}
|
|
if (fail.includes("blocked")) {
|
|
reasons.push("audit overall=block (legacy marker)");
|
|
return { category: "rejected", reasons, sub_scores: subs };
|
|
}
|
|
if (fail.includes("request_changes")) {
|
|
reasons.push("audit overall=request_changes (legacy marker)");
|
|
return { category: "partially_accepted", reasons, sub_scores: subs };
|
|
}
|
|
|
|
// Severity-derived markers (current Phase 2 transform):
|
|
const sevSucc = succ.find(m => m.startsWith("audit_severity_"));
|
|
const sevFail = fail.find(m => m.startsWith("audit_severity_"));
|
|
if (sevSucc) {
|
|
reasons.push(`${sevSucc} → minor finding`);
|
|
return { category: "accepted", reasons, sub_scores: subs };
|
|
}
|
|
if (sevFail === "audit_severity_medium") {
|
|
reasons.push("audit_severity_medium → finding warrants review");
|
|
return { category: "partially_accepted", reasons, sub_scores: subs };
|
|
}
|
|
if (sevFail === "audit_severity_high" || sevFail === "audit_severity_critical") {
|
|
reasons.push(`${sevFail} → blocking finding`);
|
|
return { category: "rejected", reasons, sub_scores: subs };
|
|
}
|
|
|
|
reasons.push("audit row has no severity or overall marker");
|
|
return { category: "needs_human_review", reasons, sub_scores: subs };
|
|
}
|
|
|
|
function scoreContractAnalysis(r: EvidenceRecord): ScoreOutput {
|
|
const reasons: string[] = [];
|
|
const subs: ScoredRun["sub_scores"] = {};
|
|
const v = r.observer_verdict;
|
|
// failure_markers takes precedence: explicit rejection beats absent verdict
|
|
if ((r.failure_markers ?? []).includes("observer_rejected") || v === "reject") {
|
|
subs.observer_verdict = "reject";
|
|
reasons.push("contract analysis: observer rejected");
|
|
return { category: "rejected", reasons, sub_scores: subs };
|
|
}
|
|
if (v === "accept") {
|
|
subs.observer_verdict = "accept";
|
|
reasons.push("contract analysis: observer accepted");
|
|
return { category: "accepted", reasons, sub_scores: subs };
|
|
}
|
|
if (v === "cycle") {
|
|
subs.observer_verdict = "cycle";
|
|
reasons.push("contract analysis: observer cycled (partial)");
|
|
return { category: "partially_accepted", reasons, sub_scores: subs };
|
|
}
|
|
reasons.push("contract analysis: no observer verdict signal");
|
|
return { category: "needs_human_review", reasons, sub_scores: subs };
|
|
}
|
|
|
|
// ─── Class B: telemetry-rich ──────────────────────────────────────
|
|
|
|
function scoreAutoApply(r: EvidenceRecord): ScoreOutput {
|
|
const reasons: string[] = [];
|
|
const subs: ScoredRun["sub_scores"] = {};
|
|
if ((r.success_markers ?? []).includes("committed")) {
|
|
subs.cargo_green = true;
|
|
reasons.push("auto_apply: patch committed (cargo green + warning baseline + rationale alignment passed)");
|
|
return { category: "accepted", reasons, sub_scores: subs };
|
|
}
|
|
const failures = (r.failure_markers ?? []);
|
|
const reverted = failures.find(f => f.includes("reverted"));
|
|
if (reverted) {
|
|
if (reverted.includes("build_red")) subs.cargo_green = false;
|
|
reasons.push(`auto_apply: ${reverted}`);
|
|
return { category: "rejected", reasons, sub_scores: subs };
|
|
}
|
|
// no_patches / dry_run / all_rejected — not a failure of code, but no commit either
|
|
reasons.push("auto_apply: no commit + no revert (likely no_patches or dry_run)");
|
|
return { category: "needs_human_review", reasons, sub_scores: subs };
|
|
}
|
|
|
|
function scoreOutcomes(r: EvidenceRecord): ScoreOutput {
|
|
const reasons: string[] = [];
|
|
const subs: ScoredRun["sub_scores"] = {};
|
|
if ((r.success_markers ?? []).includes("all_events_ok")) {
|
|
reasons.push("outcomes: all events ok");
|
|
return { category: "accepted", reasons, sub_scores: subs };
|
|
}
|
|
// Validation results may carry partial signal
|
|
const gap = r.validation_results?.gap_signals as number | undefined;
|
|
if (typeof gap === "number" && gap > 0) {
|
|
reasons.push(`outcomes: ${gap} gap signal(s) detected`);
|
|
return { category: "partially_accepted", reasons, sub_scores: subs };
|
|
}
|
|
reasons.push("outcomes: no decisive marker — defer to human");
|
|
return { category: "needs_human_review", reasons, sub_scores: subs };
|
|
}
|
|
|
|
function scoreModeExperiment(r: EvidenceRecord): ScoreOutput {
|
|
const reasons: string[] = [];
|
|
const subs: ScoredRun["sub_scores"] = {};
|
|
// mode_experiments at Phase 2 lacks markers (transform doesn't derive
|
|
// them yet). v1 derivation: a non-empty response with reasonable
|
|
// latency is at least partially_accepted; otherwise needs_human_review.
|
|
// Anything stronger needs the grounding-from-mode_compare hook in
|
|
// Phase 4 / re-scoring.
|
|
if (typeof r.text !== "string" || r.text.trim().length === 0) {
|
|
reasons.push("mode_experiment: empty response text");
|
|
return { category: "rejected", reasons, sub_scores: subs };
|
|
}
|
|
if (typeof r.latency_ms === "number" && r.latency_ms > 120_000) {
|
|
reasons.push(`mode_experiment: latency ${r.latency_ms}ms exceeds 2-minute soft cap`);
|
|
return { category: "partially_accepted", reasons, sub_scores: subs };
|
|
}
|
|
reasons.push("mode_experiment: response present, latency within bounds; verdict not yet wired");
|
|
return { category: "needs_human_review", reasons, sub_scores: subs };
|
|
}
|
|
|
|
// ─── Class C: pure-extraction ────────────────────────────────────
|
|
|
|
function scoreExtraction(r: EvidenceRecord): ScoreOutput {
|
|
// Phase 3 v1: extraction-class records have no native scoring
|
|
// signal. Default to needs_human_review with an explicit reason.
|
|
// Phase 3 v2 will JOIN to a parent verdict-bearing record.
|
|
const reasons = ["extraction-class source has no native scoring signal — JOIN to parent verdict pending Phase 3 v2"];
|
|
return { category: "needs_human_review", reasons, sub_scores: {} };
|
|
}
|
|
|
|
// ─── Dispatch ─────────────────────────────────────────────────────
|
|
|
|
export function scoreRecord(record: EvidenceRecord): ScoreOutput {
|
|
const cls = sourceClassFor(record.provenance.source_file);
|
|
const stem = record.provenance.source_file.replace(/^data\/_kb\//, "").replace(/\.jsonl$/, "");
|
|
|
|
if (cls === "verdict") {
|
|
if (stem === "scrum_reviews") return scoreScrumReview(record);
|
|
if (stem === "observer_reviews") return scoreObserverReview(record);
|
|
if (stem === "audits") return scoreAudit(record);
|
|
if (stem === "contract_analyses") return scoreContractAnalysis(record);
|
|
}
|
|
if (cls === "telemetry") {
|
|
if (stem === "auto_apply") return scoreAutoApply(record);
|
|
if (stem === "outcomes") return scoreOutcomes(record);
|
|
if (stem === "mode_experiments") return scoreModeExperiment(record);
|
|
}
|
|
return scoreExtraction(record);
|
|
}
|
|
|
|
// Build a complete ScoredRun. Caller supplies recorded_at + the
|
|
// source file / line offset to populate provenance.
|
|
export async function buildScoredRun(
|
|
record: EvidenceRecord,
|
|
source_file_relpath: string,
|
|
line_offset: number,
|
|
recorded_at: string,
|
|
): Promise<ScoredRun> {
|
|
const out = scoreRecord(record);
|
|
// Compute provenance.sig_hash over the EvidenceRecord (not raw source);
|
|
// ScoredRun traces to the materialized evidence row, not the raw stream.
|
|
const sig_hash = await canonicalSha256(record);
|
|
return {
|
|
schema_version: SCORED_RUN_SCHEMA_VERSION,
|
|
evidence_run_id: record.run_id,
|
|
evidence_task_id: record.task_id,
|
|
category: out.category,
|
|
reasons: out.reasons,
|
|
scored_at: recorded_at,
|
|
scorer_version: SCORER_VERSION,
|
|
sub_scores: out.sub_scores,
|
|
provenance: {
|
|
source_file: source_file_relpath,
|
|
line_offset,
|
|
sig_hash,
|
|
recorded_at,
|
|
},
|
|
};
|
|
}
|