diff --git a/scripts/distillation/score_runs.ts b/scripts/distillation/score_runs.ts
new file mode 100644
index 0000000..edd8bfa
--- /dev/null
+++ b/scripts/distillation/score_runs.ts
@@ -0,0 +1,305 @@
+// score_runs.ts — CLI + I/O around the pure scoreRecord function.
+// Reads data/evidence/YYYY/MM/DD/*.jsonl and writes scored-runs at the
+// matching partition. Mirrors build_evidence_index.ts conventions:
+// idempotent, schema-gated, receipt-emitting.
+
+import { existsSync, readFileSync, mkdirSync, writeFileSync, readdirSync, statSync, appendFileSync } from "node:fs";
+import { resolve, dirname } from "node:path";
+import { spawnSync } from "node:child_process";
+
+import { buildScoredRun } from "./scorer";
+import { validateEvidenceRecord, type EvidenceRecord } from "../../auditor/schemas/distillation/evidence_record";
+import { validateScoredRun } from "../../auditor/schemas/distillation/scored_run";
+import { RECEIPT_SCHEMA_VERSION, validateReceipt, type Receipt, type FileReference } from "../../auditor/schemas/distillation/receipt";
+
+const DEFAULT_ROOT = process.env.LH_DISTILL_ROOT ?? "/home/profit/lakehouse";
+
+export interface ScoreOptions {
+  root: string;
+  recorded_at: string;
+  dry_run?: boolean;
+}
+
+export interface ScoreSourceResult {
+  source_file: string;
+  rows_read: number;
+  rows_written: number;
+  rows_skipped: number;
+  rows_deduped: number;
+  by_category: Record<string, number>;
+  output_files: string[];
+}
+
+export interface ScoreResult {
+  sources: ScoreSourceResult[];
+  totals: {
+    rows_read: number;
+    rows_written: number;
+    rows_skipped: number;
+    rows_deduped: number;
+    by_category: Record<string, number>;
+  };
+  receipt: Receipt;
+  receipt_path: string;
+  scored_dir: string;
+  skips_path: string;
+}
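+
+// Illustrative call (the root below is hypothetical; scoreAll is the
+// entry point the CLI wraps):
+//
+//   const r = await scoreAll({
+//     root: "/tmp/lakehouse",
+//     recorded_at: new Date().toISOString(),
+//     dry_run: true,
+//   });
+//   r.totals.by_category; // e.g. { accepted: 3, rejected: 2, ... }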
+
+function listEvidenceFiles(evidence_root: string): string[] {
+  const out: string[] = [];
+  if (!existsSync(evidence_root)) return out;
+  for (const yyyy of readdirSync(evidence_root).sort()) {
+    const yp = resolve(evidence_root, yyyy);
+    if (!statSync(yp).isDirectory()) continue;
+    for (const mm of readdirSync(yp).sort()) {
+      const mp = resolve(yp, mm);
+      if (!statSync(mp).isDirectory()) continue;
+      for (const dd of readdirSync(mp).sort()) {
+        const dp = resolve(mp, dd);
+        if (!statSync(dp).isDirectory()) continue;
+        for (const f of readdirSync(dp)) {
+          if (f.endsWith(".jsonl")) out.push(resolve(dp, f));
+        }
+      }
+    }
+  }
+  return out;
+}
+
+function sha256OfFile(path: string): string {
+  const h = new Bun.CryptoHasher("sha256");
+  h.update(readFileSync(path));
+  return h.digest("hex");
+}
+
+function gitSha(root: string): string {
+  const r = spawnSync("git", ["-C", root, "rev-parse", "HEAD"], { encoding: "utf8" });
+  return r.status === 0 ? r.stdout.trim() : "0".repeat(40);
+}
+function gitBranch(root: string): string | undefined {
+  const r = spawnSync("git", ["-C", root, "rev-parse", "--abbrev-ref", "HEAD"], { encoding: "utf8" });
+  return r.status === 0 ? r.stdout.trim() : undefined;
+}
+function gitDirty(root: string): boolean {
+  const r = spawnSync("git", ["-C", root, "status", "--porcelain"], { encoding: "utf8" });
+  return r.status === 0 && r.stdout.trim().length > 0;
+}
+
+function loadSeenHashes(out_path: string): Set<string> {
+  const seen = new Set<string>();
+  if (!existsSync(out_path)) return seen;
+  for (const line of readFileSync(out_path, "utf8").split("\n")) {
+    if (!line) continue;
+    try {
+      const row = JSON.parse(line);
+      if (row?.provenance?.sig_hash) seen.add(row.provenance.sig_hash);
+    } catch { /* malformed — ignore */ }
+  }
+  return seen;
+}
+
+async function processEvidenceFile(
+  ev_path: string,
+  opts: ScoreOptions,
+  scored_dir: string,
+  skips_path: string,
+): Promise<ScoreSourceResult> {
+  // Output mirrors the input partition (YYYY/MM/DD/<stem>.jsonl)
+  const partition = ev_path.match(/data\/evidence\/(\d{4}\/\d{2}\/\d{2})\//)?.[1] ?? "unpartitioned";
+  const stem = ev_path.split("/").pop()!.replace(/\.jsonl$/, "");
+  const out_path = resolve(scored_dir, partition, `${stem}.jsonl`);
+  const out_relpath = `data/scored-runs/${partition}/${stem}.jsonl`;
+
+  const result: ScoreSourceResult = {
+    source_file: ev_path.replace(opts.root + "/", ""),
+    rows_read: 0,
+    rows_written: 0,
+    rows_skipped: 0,
+    rows_deduped: 0,
+    by_category: { accepted: 0, partially_accepted: 0, rejected: 0, needs_human_review: 0 },
+    output_files: [],
+  };
+
+  if (!opts.dry_run) mkdirSync(dirname(out_path), { recursive: true });
+
+  const seen = loadSeenHashes(out_path);
+  const lines = readFileSync(ev_path, "utf8").split("\n").filter(Boolean);
+  const rowsToWrite: string[] = [];
+  const skipsToWrite: string[] = [];
+
+  for (let i = 0; i < lines.length; i++) {
+    result.rows_read++;
+    let evRow: any;
+    try { evRow = JSON.parse(lines[i]); }
+    catch (e) {
+      result.rows_skipped++;
+      skipsToWrite.push(JSON.stringify({
+        evidence_file: result.source_file, line: i,
+        errors: ["evidence not JSON: " + (e as Error).message.slice(0, 200)],
+        recorded_at: opts.recorded_at,
+      }));
+      continue;
+    }
+
+    // Re-validate the evidence row before scoring — defensive; if a
+    // malformed row slipped past Phase 2 it shouldn't poison Phase 3.
+    const ev = validateEvidenceRecord(evRow);
+    if (!ev.valid) {
+      result.rows_skipped++;
+      skipsToWrite.push(JSON.stringify({
+        evidence_file: result.source_file, line: i,
+        errors: ev.errors,
+        recorded_at: opts.recorded_at,
+      }));
+      continue;
+    }
+
+    const scored = await buildScoredRun(ev.value as EvidenceRecord, out_relpath, i, opts.recorded_at);
+    if (seen.has(scored.provenance.sig_hash)) {
+      result.rows_deduped++;
+      continue;
+    }
+    seen.add(scored.provenance.sig_hash);
+
+    const sv = validateScoredRun(scored);
+    if (!sv.valid) {
+      result.rows_skipped++;
+      skipsToWrite.push(JSON.stringify({
+        evidence_file: result.source_file, line: i,
+        errors: sv.errors.map(e => "scored_run schema: " + e),
+        recorded_at: opts.recorded_at,
+      }));
+      continue;
+    }
+
+    rowsToWrite.push(JSON.stringify(sv.value));
+    result.rows_written++;
+    result.by_category[sv.value.category] = (result.by_category[sv.value.category] ?? 0) + 1;
+  }
+
+  if (!opts.dry_run) {
+    if (rowsToWrite.length > 0) {
+      appendFileSync(out_path, rowsToWrite.join("\n") + "\n");
+      result.output_files.push(out_path);
+    }
+    if (skipsToWrite.length > 0) {
+      mkdirSync(dirname(skips_path), { recursive: true });
+      appendFileSync(skips_path, skipsToWrite.join("\n") + "\n");
+    }
+  }
+
+  return result;
+}
+
+export async function scoreAll(opts: ScoreOptions): Promise<ScoreResult> {
+  const evidence_root = resolve(opts.root, "data/evidence");
+  const scored_dir = resolve(opts.root, "data/scored-runs");
+  const skips_path = resolve(opts.root, "data/_kb/scoring_skips.jsonl");
+  const reports_dir = resolve(opts.root, "reports/distillation");
+
+  const started_ms = Date.now();
+  const ev_files = listEvidenceFiles(evidence_root);
+  const sources: ScoreSourceResult[] = [];
+
+  for (const ev of ev_files) {
+    sources.push(await processEvidenceFile(ev, opts, scored_dir, skips_path));
+  }
+
+  const totals = sources.reduce((acc, s) => ({
+    rows_read: acc.rows_read + s.rows_read,
+    rows_written: acc.rows_written + s.rows_written,
+    rows_skipped: acc.rows_skipped + s.rows_skipped,
+    rows_deduped: acc.rows_deduped + s.rows_deduped,
+    by_category: {
+      accepted: (acc.by_category.accepted ?? 0) + (s.by_category.accepted ?? 0),
+      partially_accepted: (acc.by_category.partially_accepted ?? 0) + (s.by_category.partially_accepted ?? 0),
+      rejected: (acc.by_category.rejected ?? 0) + (s.by_category.rejected ?? 0),
+      needs_human_review: (acc.by_category.needs_human_review ?? 0) + (s.by_category.needs_human_review ?? 0),
+    },
+  }), { rows_read: 0, rows_written: 0, rows_skipped: 0, rows_deduped: 0, by_category: {} as Record<string, number> });
+
+  const ended_at = new Date().toISOString();
+  const duration_ms = Date.now() - started_ms;
+
+  const input_files: FileReference[] = ev_files.map(p => ({
+    path: p.replace(opts.root + "/", ""),
+    sha256: sha256OfFile(p),
+    bytes: statSync(p).size,
+  }));
+  const output_files: FileReference[] = [];
+  for (const s of sources) {
+    for (const out_path of s.output_files) {
+      try {
+        output_files.push({
+          path: out_path.replace(opts.root + "/", ""),
+          sha256: sha256OfFile(out_path),
+          bytes: statSync(out_path).size,
+        });
+      } catch { /* dry-run path */ }
+    }
+  }
+
+  const errors: string[] = [];
+  const warnings: string[] = [];
+  for (const s of sources) {
+    if (s.rows_skipped > 0) warnings.push(`${s.source_file}: ${s.rows_skipped} skipped`);
+  }
+
+  const receipt: Receipt = {
+    schema_version: RECEIPT_SCHEMA_VERSION,
+    command: "bun run scripts/distillation/score_runs.ts" + (opts.dry_run ? " --dry-run" : ""),
+    git_sha: gitSha(opts.root),
+    git_branch: gitBranch(opts.root),
+    git_dirty: gitDirty(opts.root),
+    started_at: opts.recorded_at,
+    ended_at,
+    duration_ms,
+    input_files,
+    output_files,
+    record_counts: {
+      in: totals.rows_read,
+      out: totals.rows_written,
+      skipped: totals.rows_skipped,
+      deduped: totals.rows_deduped,
+      cat_accepted: totals.by_category.accepted ?? 0,
+      cat_partially_accepted: totals.by_category.partially_accepted ?? 0,
+      cat_rejected: totals.by_category.rejected ?? 0,
+      cat_needs_human_review: totals.by_category.needs_human_review ?? 0,
0, + }, + validation_pass: totals.rows_skipped === 0, + errors, + warnings, + }; + const rv = validateReceipt(receipt); + if (!rv.valid) { + receipt.errors.push(...rv.errors.map(e => "receipt schema: " + e)); + receipt.validation_pass = false; + } + + const stamp = ended_at.replace(/[:.]/g, "-"); + const receipt_path = resolve(reports_dir, stamp, "receipt.json"); + if (!opts.dry_run) { + mkdirSync(dirname(receipt_path), { recursive: true }); + writeFileSync(receipt_path, JSON.stringify(receipt, null, 2) + "\n"); + } + + return { sources, totals, receipt, receipt_path, scored_dir, skips_path }; +} + +async function cli() { + const dry_run = process.argv.includes("--dry-run"); + const recorded_at = new Date().toISOString(); + const r = await scoreAll({ root: DEFAULT_ROOT, recorded_at, dry_run }); + + console.log(`[score_runs] ${r.totals.rows_read} read · ${r.totals.rows_written} written · ${r.totals.rows_skipped} skipped · ${r.totals.rows_deduped} deduped${dry_run ? " (DRY RUN)" : ""}`); + console.log(`[score_runs] categories: accepted=${r.totals.by_category.accepted} partial=${r.totals.by_category.partially_accepted} rejected=${r.totals.by_category.rejected} needs_human=${r.totals.by_category.needs_human_review}`); + for (const s of r.sources) { + const c = s.by_category; + console.log(` ${s.source_file}: read=${s.rows_read} wrote=${s.rows_written} acc=${c.accepted ?? 0} part=${c.partially_accepted ?? 0} rej=${c.rejected ?? 0} hum=${c.needs_human_review ?? 0}`); + } + if (!dry_run) console.log(`[score_runs] receipt: ${r.receipt_path}`); + if (!r.receipt.validation_pass) process.exit(1); +} + +if (import.meta.main) cli().catch(e => { console.error(e); process.exit(1); }); diff --git a/scripts/distillation/scorer.ts b/scripts/distillation/scorer.ts new file mode 100644 index 0000000..5cd292d --- /dev/null +++ b/scripts/distillation/scorer.ts @@ -0,0 +1,303 @@ +// scorer.ts — pure deterministic Success Scorer. +// +// Takes one EvidenceRecord, returns category + reasons + sub_scores. +// NO I/O, NO LLM, NO clock reads, NO mutable state. The only randomness +// allowed is none. Identical input → identical output forever. +// +// Three-class strategy (see docs/recon/local-distillation-recon.md + +// data/_kb/evidence_health.md for the source taxonomy): +// +// CLASS A — verdict-bearing +// scrum_reviews, observer_reviews, audits, contract_analyses +// Direct scoring from existing markers/observer_verdict +// +// CLASS B — telemetry-rich +// auto_apply, outcomes, mode_experiments +// Markers exist but partial; needs_human_review fills the gap +// +// CLASS C — pure-extraction (no native scoring signal) +// distilled_*, audit_facts, observer_escalations +// Default needs_human_review; v2 will JOIN to parent verdict +// +// scorer_version is stamped on every output. Bumping it lets a +// downstream re-scoring detect drift between historical runs. + +import type { EvidenceRecord } from "../../auditor/schemas/distillation/evidence_record"; +import type { ScoreCategory, ScoredRun } from "../../auditor/schemas/distillation/scored_run"; +import { SCORED_RUN_SCHEMA_VERSION } from "../../auditor/schemas/distillation/scored_run"; +import { canonicalSha256 } from "../../auditor/schemas/distillation/types"; + +export const SCORER_VERSION = process.env.LH_SCORER_VERSION ?? "v1.0.0"; + +export interface ScoreOutput { + category: ScoreCategory; + reasons: string[]; + sub_scores: ScoredRun["sub_scores"]; +} + +// Map source_file (from provenance) → source class. 
+
+// Map source_file (from provenance) → source class. Centralized so that
+// adding a new source is a one-line change.
+type SourceClass = "verdict" | "telemetry" | "extraction";
+
+function sourceClassFor(source_file: string): SourceClass {
+  // Strip the data/_kb/ prefix and .jsonl suffix to compare by stem
+  const stem = source_file.replace(/^data\/_kb\//, "").replace(/\.jsonl$/, "");
+  switch (stem) {
+    case "scrum_reviews":
+    case "observer_reviews":
+    case "audits":
+    case "contract_analyses":
+      return "verdict";
+    case "auto_apply":
+    case "outcomes":
+    case "mode_experiments":
+      return "telemetry";
+    case "distilled_facts":
+    case "distilled_procedures":
+    case "distilled_config_hints":
+    case "audit_facts":
+    case "observer_escalations":
+      return "extraction";
+    default:
+      // Unknown source — route to extraction (most conservative: it
+      // forces needs_human_review until a transform is added).
+      return "extraction";
+  }
+}
+
+// ─── Class A: verdict-bearing ─────────────────────────────────────
+
+function scoreScrumReview(r: EvidenceRecord): ScoreOutput {
+  const reasons: string[] = [];
+  const subs: ScoredRun["sub_scores"] = {};
+
+  const successMarker = (r.success_markers ?? []).find(m => m.startsWith("accepted_on_attempt_"));
+  if (!successMarker) {
+    reasons.push("scrum_review missing accepted_on_attempt_* success marker");
+    return { category: "needs_human_review", reasons, sub_scores: subs };
+  }
+  const attempt = Number(successMarker.replace("accepted_on_attempt_", ""));
+  if (!Number.isFinite(attempt)) {
+    reasons.push(`scrum: unparseable attempt marker ${successMarker}`);
+    return { category: "needs_human_review", reasons, sub_scores: subs };
+  }
+  subs.accepted_on_attempt = attempt;
+  if (attempt === 1) {
+    reasons.push("scrum: accepted on first attempt");
+    return { category: "accepted", reasons, sub_scores: subs };
+  }
+  if (attempt <= 3) {
+    reasons.push(`scrum: accepted after ${attempt} attempts`);
+    return { category: "partially_accepted", reasons, sub_scores: subs };
+  }
+  reasons.push(`scrum: accepted only after ${attempt} attempts (high-cost path)`);
+  return { category: "partially_accepted", reasons, sub_scores: subs };
+}
+
+function scoreObserverReview(r: EvidenceRecord): ScoreOutput {
+  const reasons: string[] = [];
+  const subs: ScoredRun["sub_scores"] = {};
+  const v = r.observer_verdict;
+  if (v === "accept") {
+    subs.observer_verdict = "accept";
+    reasons.push("observer accepted the reviewed attempt");
+    return { category: "accepted", reasons, sub_scores: subs };
+  }
+  if (v === "reject") {
+    subs.observer_verdict = "reject";
+    reasons.push("observer rejected the reviewed attempt");
+    return { category: "rejected", reasons, sub_scores: subs };
+  }
+  if (v === "cycle") {
+    subs.observer_verdict = "cycle";
+    reasons.push("observer flagged the attempt as cycling — partial signal");
+    return { category: "partially_accepted", reasons, sub_scores: subs };
+  }
+  reasons.push(`observer_verdict missing or unrecognized: ${JSON.stringify(v ?? null)}`);
+  return { category: "needs_human_review", reasons, sub_scores: subs };
+}
+
+function scoreAudit(r: EvidenceRecord): ScoreOutput {
+  // audits.jsonl is the auditor's per-finding stream (not PR verdicts).
+  // The Phase 2 transform encodes severity into markers:
+  //   audit_severity_{info,low}      → accepted (minor finding)
+  //   audit_severity_medium          → partially_accepted
+  //   audit_severity_{high,critical} → rejected (real problem)
+  // Older "approved"/"blocked"/"request_changes" markers are also handled
+  // for back-compat with any pre-fix materializations on disk.
+  const reasons: string[] = [];
+  const subs: ScoredRun["sub_scores"] = {};
+  const succ = r.success_markers ?? [];
+  const fail = r.failure_markers ?? [];
+
+  if (succ.includes("approved")) {
+    reasons.push("audit overall=approved (legacy marker)");
+    return { category: "accepted", reasons, sub_scores: subs };
+  }
+  if (fail.includes("blocked")) {
+    reasons.push("audit overall=block (legacy marker)");
+    return { category: "rejected", reasons, sub_scores: subs };
+  }
+  if (fail.includes("request_changes")) {
+    reasons.push("audit overall=request_changes (legacy marker)");
+    return { category: "partially_accepted", reasons, sub_scores: subs };
+  }
+
+  // Severity-derived markers (current Phase 2 transform):
+  const sevSucc = succ.find(m => m.startsWith("audit_severity_"));
+  const sevFail = fail.find(m => m.startsWith("audit_severity_"));
+  if (sevSucc) {
+    reasons.push(`${sevSucc} → minor finding`);
+    return { category: "accepted", reasons, sub_scores: subs };
+  }
+  if (sevFail === "audit_severity_medium") {
+    reasons.push("audit_severity_medium → finding warrants review");
+    return { category: "partially_accepted", reasons, sub_scores: subs };
+  }
+  if (sevFail === "audit_severity_high" || sevFail === "audit_severity_critical") {
+    reasons.push(`${sevFail} → blocking finding`);
+    return { category: "rejected", reasons, sub_scores: subs };
+  }
+
+  reasons.push("audit row has no severity or overall marker");
+  return { category: "needs_human_review", reasons, sub_scores: subs };
+}
+
+function scoreContractAnalysis(r: EvidenceRecord): ScoreOutput {
+  const reasons: string[] = [];
+  const subs: ScoredRun["sub_scores"] = {};
+  const v = r.observer_verdict;
+  // failure_markers take precedence: an explicit rejection beats an absent verdict
+  if ((r.failure_markers ?? []).includes("observer_rejected") || v === "reject") {
+    subs.observer_verdict = "reject";
+    reasons.push("contract analysis: observer rejected");
+    return { category: "rejected", reasons, sub_scores: subs };
+  }
+  if (v === "accept") {
+    subs.observer_verdict = "accept";
+    reasons.push("contract analysis: observer accepted");
+    return { category: "accepted", reasons, sub_scores: subs };
+  }
+  if (v === "cycle") {
+    subs.observer_verdict = "cycle";
+    reasons.push("contract analysis: observer cycled (partial)");
+    return { category: "partially_accepted", reasons, sub_scores: subs };
+  }
+  reasons.push("contract analysis: no observer verdict signal");
+  return { category: "needs_human_review", reasons, sub_scores: subs };
+}
+
+// ─── Class B: telemetry-rich ──────────────────────────────────────
+
+function scoreAutoApply(r: EvidenceRecord): ScoreOutput {
+  const reasons: string[] = [];
+  const subs: ScoredRun["sub_scores"] = {};
+  if ((r.success_markers ?? []).includes("committed")) {
+    subs.cargo_green = true;
+    reasons.push("auto_apply: patch committed (cargo green + warning baseline + rationale alignment passed)");
+    return { category: "accepted", reasons, sub_scores: subs };
+  }
+  const failures = (r.failure_markers ?? []);
+  const reverted = failures.find(f => f.includes("reverted"));
+  if (reverted) {
+    if (reverted.includes("build_red")) subs.cargo_green = false;
+    reasons.push(`auto_apply: ${reverted}`);
+    return { category: "rejected", reasons, sub_scores: subs };
+  }
+  // no_patches / dry_run / all_rejected — not a failure of the code, but no commit either
+  reasons.push("auto_apply: no commit + no revert (likely no_patches or dry_run)");
+  return { category: "needs_human_review", reasons, sub_scores: subs };
+}
+
+function scoreOutcomes(r: EvidenceRecord): ScoreOutput {
+  const reasons: string[] = [];
+  const subs: ScoredRun["sub_scores"] = {};
+  if ((r.success_markers ?? []).includes("all_events_ok")) {
+    reasons.push("outcomes: all events ok");
+    return { category: "accepted", reasons, sub_scores: subs };
+  }
+  // Validation results may carry a partial signal
+  const gap = r.validation_results?.gap_signals as number | undefined;
+  if (typeof gap === "number" && gap > 0) {
+    reasons.push(`outcomes: ${gap} gap signal(s) detected`);
+    return { category: "partially_accepted", reasons, sub_scores: subs };
+  }
+  reasons.push("outcomes: no decisive marker — defer to human");
+  return { category: "needs_human_review", reasons, sub_scores: subs };
+}
+
+function scoreModeExperiment(r: EvidenceRecord): ScoreOutput {
+  const reasons: string[] = [];
+  const subs: ScoredRun["sub_scores"] = {};
+  // mode_experiments at Phase 2 lacks markers (the transform doesn't
+  // derive them yet). v1 derivation: an empty response is rejected, an
+  // over-cap latency downgrades to partially_accepted, and a clean
+  // response with reasonable latency still defers to needs_human_review.
+  // Anything stronger needs the grounding-from-mode_compare hook in
+  // Phase 4 / re-scoring.
+  if (typeof r.text !== "string" || r.text.trim().length === 0) {
+    reasons.push("mode_experiment: empty response text");
+    return { category: "rejected", reasons, sub_scores: subs };
+  }
+  if (typeof r.latency_ms === "number" && r.latency_ms > 120_000) {
+    reasons.push(`mode_experiment: latency ${r.latency_ms}ms exceeds 2-minute soft cap`);
+    return { category: "partially_accepted", reasons, sub_scores: subs };
+  }
+  reasons.push("mode_experiment: response present, latency within bounds; verdict not yet wired");
+  return { category: "needs_human_review", reasons, sub_scores: subs };
+}
+
+// ─── Class C: pure-extraction ────────────────────────────────────
+
+function scoreExtraction(_r: EvidenceRecord): ScoreOutput {
+  // Phase 3 v1: extraction-class records have no native scoring
+  // signal. Default to needs_human_review with an explicit reason.
+  // Phase 3 v2 will JOIN to a parent verdict-bearing record.
+  const reasons = ["extraction-class source has no native scoring signal — JOIN to parent verdict pending Phase 3 v2"];
+  return { category: "needs_human_review", reasons, sub_scores: {} };
+}
+
+// ─── Dispatch ─────────────────────────────────────────────────────
+
+export function scoreRecord(record: EvidenceRecord): ScoreOutput {
+  const cls = sourceClassFor(record.provenance.source_file);
+  const stem = record.provenance.source_file.replace(/^data\/_kb\//, "").replace(/\.jsonl$/, "");
+
+  if (cls === "verdict") {
+    if (stem === "scrum_reviews") return scoreScrumReview(record);
+    if (stem === "observer_reviews") return scoreObserverReview(record);
+    if (stem === "audits") return scoreAudit(record);
+    if (stem === "contract_analyses") return scoreContractAnalysis(record);
+  }
+  if (cls === "telemetry") {
+    if (stem === "auto_apply") return scoreAutoApply(record);
+    if (stem === "outcomes") return scoreOutcomes(record);
+    if (stem === "mode_experiments") return scoreModeExperiment(record);
+  }
+  return scoreExtraction(record);
+}
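+
+// Drift-check sketch (a hypothetical downstream consumer; `historical`
+// stands for a previously materialized ScoredRun of the same evidence row):
+//
+//   const rescored = scoreRecord(evidenceRow);
+//   if (historical.scorer_version !== SCORER_VERSION &&
+//       rescored.category !== historical.category) {
+//     // the rules changed, not the data; flag the row for re-review
+//   }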
+
+// Build a complete ScoredRun. The caller supplies recorded_at plus the
+// source file / line offset to populate provenance.
+export async function buildScoredRun(
+  record: EvidenceRecord,
+  source_file_relpath: string,
+  line_offset: number,
+  recorded_at: string,
+): Promise<ScoredRun> {
+  const out = scoreRecord(record);
+  // Compute provenance.sig_hash over the EvidenceRecord (not the raw source);
+  // a ScoredRun traces to the materialized evidence row, not the raw stream.
+  const sig_hash = await canonicalSha256(record);
+  return {
+    schema_version: SCORED_RUN_SCHEMA_VERSION,
+    evidence_run_id: record.run_id,
+    evidence_task_id: record.task_id,
+    category: out.category,
+    reasons: out.reasons,
+    scored_at: recorded_at,
+    scorer_version: SCORER_VERSION,
+    sub_scores: out.sub_scores,
+    provenance: {
+      source_file: source_file_relpath,
+      line_offset,
+      sig_hash,
+      recorded_at,
+    },
+  };
+}
diff --git a/scripts/distillation/transforms.ts b/scripts/distillation/transforms.ts
index c1f39d0..6146618 100644
--- a/scripts/distillation/transforms.ts
+++ b/scripts/distillation/transforms.ts
@@ -211,19 +211,30 @@ export const TRANSFORMS: TransformDef[] = [
     }),
   },
   {
+    // 2026-04-26 correction: data/_kb/audits.jsonl is the auditor's
+    // per-FINDING stream (recon misnamed it "PR verdicts"). Schema:
+    //   {embedding, evidence, finding_id, phase, resolution, severity, topic, ts}
+    // The actual per-PR verdicts live in data/_auditor/verdicts/*.json,
+    // not in this JSONL. So we score by severity here: info/low →
+    // accepted (audit found a minor issue), medium → partially_accepted,
+    // high/critical → rejected (a real problem in the audited code).
     source_file_relpath: "data/_kb/audits.jsonl",
-    transform: ({ row, line_offset, source_file_relpath, recorded_at, sig_hash }) => ({
-      run_id: `audit:${row.head_sha ?? line_offset}`,
-      task_id: `pr:${row.pr_number}`,
-      timestamp: row.audited_at ?? new Date().toISOString(),
-      schema_version: EVIDENCE_SCHEMA_VERSION,
-      provenance: provenance({ row, line_offset, source_file_relpath, recorded_at, sig_hash }),
-      model_role: "reviewer" as ModelRole,
-      success_markers: row.overall === "approve" ? ["approved"] : undefined,
-      failure_markers: row.overall === "block" ? ["blocked"] : (row.overall === "request_changes" ? ["request_changes"] : undefined),
-      validation_results: { schema_valid: true, [`overall_${row.overall ?? "?"}`]: true },
-      text: row.one_liner ?? "",
-    }),
+    transform: ({ row, line_offset, source_file_relpath, recorded_at, sig_hash }) => {
+      const sev = String(row.severity ?? "unknown").toLowerCase();
+      const minor = sev === "info" || sev === "low";
+      const blocking = sev === "high" || sev === "critical";
+      return {
+        run_id: `audit_finding:${row.finding_id ?? line_offset}`,
+        task_id: row.phase ? `phase:${row.phase}` : "audit_finding",
+        timestamp: row.ts ?? new Date().toISOString(),
+        schema_version: EVIDENCE_SCHEMA_VERSION,
+        provenance: provenance({ row, line_offset, source_file_relpath, recorded_at, sig_hash }),
+        model_role: "reviewer" as ModelRole,
+        success_markers: minor ? [`audit_severity_${sev}`] : undefined,
+        failure_markers: blocking ? [`audit_severity_${sev}`] : (sev === "medium" ? ["audit_severity_medium"] : undefined),
+        text: typeof row.evidence === "string" ? row.evidence : (row.resolution ?? ""),
+      };
+    },
   },
   {
     source_file_relpath: "data/_kb/outcomes.jsonl",
diff --git a/tests/distillation/score_runs.test.ts b/tests/distillation/score_runs.test.ts
new file mode 100644
index 0000000..2251309
--- /dev/null
+++ b/tests/distillation/score_runs.test.ts
@@ -0,0 +1,157 @@
+// Integration test: the score_runs.ts CLI pipeline. Synthesizes evidence
+// records, runs scoreAll, and asserts behavior on the materialized scored
+// runs + receipt.
+
+import { test, expect, beforeEach, afterEach } from "bun:test";
+import { mkdirSync, writeFileSync, rmSync, existsSync, readFileSync } from "node:fs";
+import { resolve } from "node:path";
+
+import { scoreAll } from "../../scripts/distillation/score_runs";
+import { EVIDENCE_SCHEMA_VERSION, type EvidenceRecord } from "../../auditor/schemas/distillation/evidence_record";
+import { validateScoredRun } from "../../auditor/schemas/distillation/scored_run";
+import { validateReceipt } from "../../auditor/schemas/distillation/receipt";
+
+const TMP = "/tmp/distillation_test_phase3";
+const RECORDED = "2026-04-26T22:30:00.000Z";
+const SHA = "0".repeat(64);
+
+function makeEv(opts: Partial<EvidenceRecord> & { source_stem: string }): EvidenceRecord {
+  return {
+    run_id: opts.run_id ?? `run-${Math.random()}`,
+    task_id: opts.task_id ?? "task-test",
+    timestamp: opts.timestamp ?? RECORDED,
+    schema_version: EVIDENCE_SCHEMA_VERSION,
+    provenance: {
+      source_file: `data/_kb/${opts.source_stem}.jsonl`,
+      line_offset: 0,
+      sig_hash: SHA,
+      recorded_at: RECORDED,
+    },
+    ...opts,
+  } as EvidenceRecord;
+}
+
+function writeEvidence(ev: EvidenceRecord[], stem: string) {
+  const partition = "2026/04/27";
+  const dir = resolve(TMP, "data/evidence", partition);
+  mkdirSync(dir, { recursive: true });
+  writeFileSync(resolve(dir, `${stem}.jsonl`), ev.map(r => JSON.stringify(r)).join("\n") + "\n");
+}
+
+function setup() {
+  if (existsSync(TMP)) rmSync(TMP, { recursive: true, force: true });
+  mkdirSync(resolve(TMP, "data/_kb"), { recursive: true });
+
+  // Mix of every category across sources
+  writeEvidence([
+    makeEv({ source_stem: "scrum_reviews", run_id: "s1", success_markers: ["accepted_on_attempt_1"] }),
+    makeEv({ source_stem: "scrum_reviews", run_id: "s2", success_markers: ["accepted_on_attempt_3"] }),
+    makeEv({ source_stem: "scrum_reviews", run_id: "s3" }), // no markers → human
+  ], "scrum_reviews");
+
+  writeEvidence([
+    makeEv({ source_stem: "audits", run_id: "a1", success_markers: ["approved"] }),
+    makeEv({ source_stem: "audits", run_id: "a2", failure_markers: ["blocked"] }),
+    makeEv({ source_stem: "audits", run_id: "a3", failure_markers: ["request_changes"] }),
+  ], "audits");
+
+  writeEvidence([
+    makeEv({ source_stem: "auto_apply", run_id: "ap1", success_markers: ["committed"] }),
+    makeEv({ source_stem: "auto_apply", run_id: "ap2", failure_markers: ["build_red_reverted"] }),
+    makeEv({ source_stem: "auto_apply", run_id: "ap3" }),
+  ], "auto_apply");
+
+  writeEvidence([
+    makeEv({ source_stem: "distilled_facts", run_id: "df1", text: "extracted fact" }),
+  ], "distilled_facts");
+}
+
+beforeEach(setup);
+afterEach(() => { if (existsSync(TMP)) rmSync(TMP, { recursive: true, force: true }); });
+
+test("score_runs: emits ScoredRun for every EvidenceRecord", async () => {
+  const r = await scoreAll({ root: TMP, recorded_at: RECORDED });
+  expect(r.totals.rows_read).toBe(10);
+  expect(r.totals.rows_written).toBe(10);
+  expect(r.totals.rows_skipped).toBe(0);
+});
+
+test("score_runs: category distribution matches expected per source", async () => {
+  const r = await scoreAll({ root: TMP, recorded_at: RECORDED });
+  // 1 (s1) + 1 (a1) + 1 (ap1)   = 3 accepted
+  // 1 (s2) + 1 (a3)             = 2 partial
+  // 1 (a2) + 1 (ap2)            = 2 rejected
+  // 1 (s3) + 1 (ap3) + 1 (df1)  = 3 needs_human
+  expect(r.totals.by_category.accepted).toBe(3);
+  expect(r.totals.by_category.partially_accepted).toBe(2);
+  expect(r.totals.by_category.rejected).toBe(2);
+  expect(r.totals.by_category.needs_human_review).toBe(3);
+});
+
+test("score_runs: every output row validates against ScoredRun schema", async () => {
+  await scoreAll({ root: TMP, recorded_at: RECORDED });
+  const dir = resolve(TMP, "data/scored-runs/2026/04/27");
+  for (const stem of ["scrum_reviews", "audits", "auto_apply", "distilled_facts"]) {
+    const path = resolve(dir, `${stem}.jsonl`);
+    expect(existsSync(path)).toBe(true);
+    const lines = readFileSync(path, "utf8").trim().split("\n").filter(Boolean);
+    for (const line of lines) {
+      const v = validateScoredRun(JSON.parse(line));
+      expect(v.valid).toBe(true);
+    }
+  }
+});
+
+test("score_runs: idempotent — second run produces 0 new writes", async () => {
+  await scoreAll({ root: TMP, recorded_at: RECORDED });
+  const r2 = await scoreAll({ root: TMP, recorded_at: RECORDED });
+  expect(r2.totals.rows_written).toBe(0);
+  expect(r2.totals.rows_deduped).toBe(10);
+});
+
+test("score_runs: receipt validates and pins git_sha + record_counts + by_category", async () => {
+  const r = await scoreAll({ root: TMP, recorded_at: RECORDED });
+  const v = validateReceipt(r.receipt);
+  expect(v.valid).toBe(true);
+  expect(r.receipt.git_sha).toMatch(/^[0-9a-f]{40}$/);
+  expect(r.receipt.record_counts.in).toBe(10);
+  expect(r.receipt.record_counts.out).toBe(10);
+  expect(r.receipt.record_counts.cat_accepted).toBe(3);
+  expect(r.receipt.record_counts.cat_partially_accepted).toBe(2);
+  expect(r.receipt.record_counts.cat_rejected).toBe(2);
+  expect(r.receipt.record_counts.cat_needs_human_review).toBe(3);
+  expect(r.receipt.validation_pass).toBe(true); // 0 skips
+});
+
+test("score_runs: every output row carries provenance + reasons + scorer_version", async () => {
+  await scoreAll({ root: TMP, recorded_at: RECORDED });
+  const path = resolve(TMP, "data/scored-runs/2026/04/27/scrum_reviews.jsonl");
+  const rows = readFileSync(path, "utf8").trim().split("\n").map(l => JSON.parse(l));
+  for (const row of rows) {
+    expect(row.provenance.sig_hash).toMatch(/^[0-9a-f]{64}$/);
+    expect(row.reasons.length).toBeGreaterThan(0);
+    expect(row.scorer_version).toBeTruthy();
+  }
+});
+
+test("score_runs: malformed evidence row is skipped, valid rows still process", async () => {
+  // Inject a malformed line into one of the evidence files
+  const path = resolve(TMP, "data/evidence/2026/04/27/scrum_reviews.jsonl");
+  const existing = readFileSync(path, "utf8");
+  writeFileSync(path, existing + "{not valid json\n");
+
+  const r = await scoreAll({ root: TMP, recorded_at: RECORDED });
+  expect(r.totals.rows_skipped).toBe(1);
+  expect(r.totals.rows_written).toBe(10); // valid rows unaffected
+  expect(r.receipt.validation_pass).toBe(false); // skips > 0
+  expect(existsSync(r.skips_path)).toBe(true);
+  const skipBody = readFileSync(r.skips_path, "utf8");
+  expect(skipBody).toContain("evidence not JSON");
+});
+
+test("score_runs: dry-run reports counts but writes no scored-runs", async () => {
+  const r = await scoreAll({ root: TMP, recorded_at: RECORDED, dry_run: true });
+  expect(r.totals.rows_written).toBe(10);
+  const scoredDir = resolve(TMP, "data/scored-runs");
+  expect(existsSync(scoredDir)).toBe(false);
+});
diff --git a/tests/distillation/scorer.test.ts b/tests/distillation/scorer.test.ts
new file mode 100644
index 0000000..1b641f0
--- /dev/null
+++ b/tests/distillation/scorer.test.ts
@@ -0,0 +1,297 @@
+// Unit tests on the pure scoreRecord function. No I/O, no fixtures —
+// inline EvidenceRecord makers per source class. Each scoring rule
+// gets a positive case + at least one boundary case.
+
+import { test, expect } from "bun:test";
+import { scoreRecord, SCORER_VERSION, buildScoredRun } from "../../scripts/distillation/scorer";
+import { EVIDENCE_SCHEMA_VERSION, type EvidenceRecord } from "../../auditor/schemas/distillation/evidence_record";
+
+const NOW = "2026-04-26T22:30:00.000Z";
+const SHA = "0".repeat(64);
+
+function makeEvidence(opts: Partial<EvidenceRecord> & { source_stem: string }): EvidenceRecord {
+  return {
+    run_id: opts.run_id ?? "run-test",
+    task_id: opts.task_id ?? "task-test",
+    timestamp: opts.timestamp ?? NOW,
+    schema_version: EVIDENCE_SCHEMA_VERSION,
+    provenance: {
+      source_file: `data/_kb/${opts.source_stem}.jsonl`,
+      line_offset: 0,
+      sig_hash: SHA,
+      recorded_at: NOW,
+    },
+    ...opts,
+  } as EvidenceRecord;
+}
+
+// ─── Class A: scrum_reviews ───────────────────────────────────────
+
+test("scrum_reviews: accepted_on_attempt_1 → accepted", () => {
+  const r = scoreRecord(makeEvidence({
+    source_stem: "scrum_reviews",
+    success_markers: ["accepted_on_attempt_1"],
+  }));
+  expect(r.category).toBe("accepted");
+  expect(r.sub_scores?.accepted_on_attempt).toBe(1);
+  expect(r.reasons.some(x => x.includes("first attempt"))).toBe(true);
+});
+
+test("scrum_reviews: accepted_on_attempt_2 → partially_accepted", () => {
+  const r = scoreRecord(makeEvidence({
+    source_stem: "scrum_reviews",
+    success_markers: ["accepted_on_attempt_2"],
+  }));
+  expect(r.category).toBe("partially_accepted");
+});
+
+test("scrum_reviews: accepted_on_attempt_5 → partially_accepted with high-cost reason", () => {
+  const r = scoreRecord(makeEvidence({
+    source_stem: "scrum_reviews",
+    success_markers: ["accepted_on_attempt_5"],
+  }));
+  expect(r.category).toBe("partially_accepted");
+  expect(r.reasons.some(x => x.includes("5 attempts"))).toBe(true);
+});
+
+test("scrum_reviews: no success_markers → needs_human_review", () => {
+  const r = scoreRecord(makeEvidence({ source_stem: "scrum_reviews" }));
+  expect(r.category).toBe("needs_human_review");
+});
+
+// ─── Class A: observer_reviews ────────────────────────────────────
+
+test("observer_reviews: accept → accepted", () => {
+  const r = scoreRecord(makeEvidence({ source_stem: "observer_reviews", observer_verdict: "accept" }));
+  expect(r.category).toBe("accepted");
+  expect(r.sub_scores?.observer_verdict).toBe("accept");
+});
+
+test("observer_reviews: reject → rejected", () => {
+  const r = scoreRecord(makeEvidence({ source_stem: "observer_reviews", observer_verdict: "reject" }));
+  expect(r.category).toBe("rejected");
+});
+
+test("observer_reviews: cycle → partially_accepted", () => {
+  const r = scoreRecord(makeEvidence({ source_stem: "observer_reviews", observer_verdict: "cycle" }));
+  expect(r.category).toBe("partially_accepted");
+});
+
+test("observer_reviews: missing verdict → needs_human_review", () => {
+  const r = scoreRecord(makeEvidence({ source_stem: "observer_reviews" }));
+  expect(r.category).toBe("needs_human_review");
+});
+
+// ─── Class A: audits (per-finding stream, severity-based) ────────
+
+test("audits: severity_info → accepted (minor finding)", () => {
+  const r = scoreRecord(makeEvidence({
+    source_stem: "audits",
+    success_markers: ["audit_severity_info"],
+  }));
+  expect(r.category).toBe("accepted");
+});
+
+test("audits: severity_low → accepted", () => {
+  const r = scoreRecord(makeEvidence({
+    source_stem: "audits",
+    success_markers: ["audit_severity_low"],
+  }));
+  expect(r.category).toBe("accepted");
+});
+
+test("audits: severity_medium → partially_accepted", () => {
+  const r = scoreRecord(makeEvidence({
+    source_stem: "audits",
+    failure_markers: ["audit_severity_medium"],
+  }));
+  expect(r.category).toBe("partially_accepted");
+});
+
+test("audits: severity_high → rejected", () => {
+  const r = scoreRecord(makeEvidence({
+    source_stem: "audits",
+    failure_markers: ["audit_severity_high"],
+  }));
+  expect(r.category).toBe("rejected");
+});
+
+test("audits: severity_critical → rejected", () => {
+  const r = scoreRecord(makeEvidence({
+    source_stem: "audits",
+    failure_markers: ["audit_severity_critical"],
+  }));
+  expect(r.category).toBe("rejected");
+});
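+
+// Boundary: a row with neither severity nor legacy overall markers carries
+// no signal, so scoreAudit is expected to fall through to needs_human_review.
+test("audits: no severity or overall marker → needs_human_review", () => {
+  const r = scoreRecord(makeEvidence({ source_stem: "audits" }));
+  expect(r.category).toBe("needs_human_review");
+});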
+
+// Legacy markers preserved for back-compat with pre-fix data on disk
+test("audits: legacy 'approved' still maps to accepted", () => {
+  const r = scoreRecord(makeEvidence({
+    source_stem: "audits",
+    success_markers: ["approved"],
+  }));
+  expect(r.category).toBe("accepted");
+});
+
+test("audits: legacy 'blocked' still maps to rejected", () => {
+  const r = scoreRecord(makeEvidence({
+    source_stem: "audits",
+    failure_markers: ["blocked"],
+  }));
+  expect(r.category).toBe("rejected");
+});
+
+// ─── Class A: contract_analyses ───────────────────────────────────
+
+test("contract_analyses: observer_rejected failure marker → rejected", () => {
+  const r = scoreRecord(makeEvidence({
+    source_stem: "contract_analyses",
+    failure_markers: ["observer_rejected"],
+    observer_verdict: "reject",
+  }));
+  expect(r.category).toBe("rejected");
+});
+
+test("contract_analyses: observer accept → accepted", () => {
+  const r = scoreRecord(makeEvidence({
+    source_stem: "contract_analyses",
+    observer_verdict: "accept",
+  }));
+  expect(r.category).toBe("accepted");
+});
+
+// ─── Class B: auto_apply ──────────────────────────────────────────
+
+test("auto_apply: committed → accepted with cargo_green=true", () => {
+  const r = scoreRecord(makeEvidence({
+    source_stem: "auto_apply",
+    success_markers: ["committed"],
+  }));
+  expect(r.category).toBe("accepted");
+  expect(r.sub_scores?.cargo_green).toBe(true);
+});
+
+test("auto_apply: build_red_reverted → rejected with cargo_green=false", () => {
+  const r = scoreRecord(makeEvidence({
+    source_stem: "auto_apply",
+    failure_markers: ["build_red_reverted"],
+  }));
+  expect(r.category).toBe("rejected");
+  expect(r.sub_scores?.cargo_green).toBe(false);
+});
+
+test("auto_apply: warnings_increased_reverted → rejected", () => {
+  const r = scoreRecord(makeEvidence({
+    source_stem: "auto_apply",
+    failure_markers: ["warnings_increased_reverted"],
+  }));
+  expect(r.category).toBe("rejected");
+});
+
+test("auto_apply: no markers → needs_human_review", () => {
+  const r = scoreRecord(makeEvidence({ source_stem: "auto_apply" }));
+  expect(r.category).toBe("needs_human_review");
+});
+
+// ─── Class B: outcomes ────────────────────────────────────────────
+
+test("outcomes: all_events_ok → accepted", () => {
+  const r = scoreRecord(makeEvidence({
+    source_stem: "outcomes",
+    success_markers: ["all_events_ok"],
+  }));
+  expect(r.category).toBe("accepted");
+});
+
+test("outcomes: gap_signals > 0 → partially_accepted", () => {
+  const r = scoreRecord(makeEvidence({
+    source_stem: "outcomes",
+    validation_results: { gap_signals: 3 },
+  }));
+  expect(r.category).toBe("partially_accepted");
+});
+
+// ─── Class B: mode_experiments ────────────────────────────────────
+
+test("mode_experiments: empty text → rejected", () => {
+  const r = scoreRecord(makeEvidence({
+    source_stem: "mode_experiments",
+    text: "",
+  }));
+  expect(r.category).toBe("rejected");
+});
+
+test("mode_experiments: latency > 120s → partially_accepted", () => {
+  const r = scoreRecord(makeEvidence({
+    source_stem: "mode_experiments",
+    text: "valid response",
+    latency_ms: 150_000,
+  }));
+  expect(r.category).toBe("partially_accepted");
+});
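+
+// Boundary: a latency of exactly 120_000 ms is not over the soft cap, so
+// the record is expected to defer to needs_human_review rather than be
+// downgraded to partially_accepted.
+test("mode_experiments: latency exactly 120s → needs_human_review (boundary)", () => {
+  const r = scoreRecord(makeEvidence({
+    source_stem: "mode_experiments",
+    text: "valid response",
+    latency_ms: 120_000,
+  }));
+  expect(r.category).toBe("needs_human_review");
+});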
+
+test("mode_experiments: text + reasonable latency → needs_human_review (no native verdict yet)", () => {
+  const r = scoreRecord(makeEvidence({
+    source_stem: "mode_experiments",
+    text: "response present",
+    latency_ms: 10_000,
+  }));
+  expect(r.category).toBe("needs_human_review");
+});
+
+// ─── Class C: extraction-class default ────────────────────────────
+
+test("distilled_facts: no native verdict → needs_human_review", () => {
+  const r = scoreRecord(makeEvidence({ source_stem: "distilled_facts", text: "extracted fact" }));
+  expect(r.category).toBe("needs_human_review");
+});
+
+test("distilled_procedures: no native verdict → needs_human_review", () => {
+  const r = scoreRecord(makeEvidence({ source_stem: "distilled_procedures" }));
+  expect(r.category).toBe("needs_human_review");
+});
+
+test("audit_facts: extraction-class → needs_human_review", () => {
+  const r = scoreRecord(makeEvidence({ source_stem: "audit_facts" }));
+  expect(r.category).toBe("needs_human_review");
+});
+
+test("observer_escalations: extraction-class → needs_human_review", () => {
+  const r = scoreRecord(makeEvidence({ source_stem: "observer_escalations" }));
+  expect(r.category).toBe("needs_human_review");
+});
+
+test("unknown source: defaults to extraction class → needs_human_review", () => {
+  const r = scoreRecord(makeEvidence({ source_stem: "some_future_stream" }));
+  expect(r.category).toBe("needs_human_review");
+});
+
+// ─── Universal invariants ─────────────────────────────────────────
+
+test("every score has at least one reason (reasons non-empty)", () => {
+  // Sample a scoring of every source we know about
+  const sources = ["scrum_reviews", "observer_reviews", "audits", "contract_analyses",
+    "auto_apply", "outcomes", "mode_experiments", "distilled_facts"];
+  for (const s of sources) {
+    const r = scoreRecord(makeEvidence({ source_stem: s }));
+    expect(r.reasons.length).toBeGreaterThanOrEqual(1);
+    for (const reason of r.reasons) expect(reason.length).toBeGreaterThan(0);
+  }
+});
+
+test("buildScoredRun stamps SCORER_VERSION + computes provenance.sig_hash", async () => {
+  const ev = makeEvidence({ source_stem: "scrum_reviews", success_markers: ["accepted_on_attempt_1"] });
+  const scored = await buildScoredRun(ev, "data/scored-runs/2026/04/27/scrum_reviews.jsonl", 0, NOW);
+  expect(scored.scorer_version).toBe(SCORER_VERSION);
+  expect(scored.evidence_run_id).toBe(ev.run_id);
+  expect(scored.evidence_task_id).toBe(ev.task_id);
+  expect(scored.provenance.sig_hash).toMatch(/^[0-9a-f]{64}$/);
+  expect(scored.provenance.source_file).toBe("data/scored-runs/2026/04/27/scrum_reviews.jsonl");
+});
+
+test("buildScoredRun is deterministic — same input → same sig_hash", async () => {
+  const ev = makeEvidence({ source_stem: "scrum_reviews", success_markers: ["accepted_on_attempt_1"] });
+  const a = await buildScoredRun(ev, "p", 0, NOW);
+  const b = await buildScoredRun(ev, "p", 0, NOW);
+  expect(a.provenance.sig_hash).toBe(b.provenance.sig_hash);
+});
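+
+// Boundary for the outcomes rule: gap_signals === 0 carries no partial
+// signal, so the row is expected to fall through to needs_human_review
+// rather than partially_accepted.
+test("outcomes: gap_signals === 0 → needs_human_review (boundary)", () => {
+  const r = scoreRecord(makeEvidence({
+    source_stem: "outcomes",
+    validation_results: { gap_signals: 0 },
+  }));
+  expect(r.category).toBe("needs_human_review");
+});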