distillation: Phase 0 recon + Phase 1 schemas + Phase 2 transforms scaffold
Some checks failed
lakehouse/auditor 9 blocking issues: todo!() macro call in tests/real-world/scrum_master_pipeline.ts

Phase 0 — docs/recon/local-distillation-recon.md
Inventories the 23 KB JSONL streams + 20 vector corpora + auditor's
kb_index.ts as substrate for the now.md distillation pipeline. Maps
spec modules to existing producers, identifies real gaps, lists 9
schemas to formalize. ZERO implementation in recon — gating doc only.

Phase 1 — auditor/schemas/distillation/
9 schemas + foundation types + 48 tests passing in 502ms:

  types.ts                      shared validators + canonicalSha256
  evidence_record.ts            EVIDENCE_SCHEMA_VERSION=1, ModelRole enum
  scored_run.ts                 4 categories pinned, anchor_grounding ∈ [0,1]
  receipt.ts                    git_sha 40-char, sha256 file refs, validation_pass:bool
  playbook.ts                   non-empty source_run_ids + acceptance_criteria
  scratchpad_summary.ts         validation_status enum, hash sha256
  model_ledger.ts               success_rate ∈ [0,1], sample_count ≥ 1
  rag_sample.ts                 success_score ∈ {accepted, partially_accepted}
  sft_sample.ts                 quality_score MUST be 'accepted' (no leak)
  preference_sample.ts          chosen != rejected, source_run_ids must differ
  evidence_record.test.ts       10 tests, JSON-fixture round-trip
  schemas.test.ts               30 tests, inline fixtures
  realdata.test.ts              8 tests, real-JSONL probe

Real-data validation probe (one of the 3 notables from recon):
46 rows across 7 sources, 100% pass. distilled_facts/procedures alive.
Report at data/_kb/realdata_validation_report.md (also written by the
test). Confirms schema fits existing producers without migration.

Phase 2 scaffold — scripts/distillation/transforms.ts
Promoted PROBES from realdata.test.ts into a real TRANSFORMS array
covering 12 source streams (8 Tier 1 validated + 4 Tier 2 from
recon's untested-streams list). Pure functions: no I/O, no model
calls, no clock reads. Caller supplies recorded_at + sig_hash so
materializer is deterministic by construction.

Spec non-negotiables enforced at schema layer (defense in depth):
  - provenance{source_file, sig_hash, recorded_at} required everywhere
  - schema_version mismatch hard-rejects (forward-compat gate)
  - SFT no-leak: validateSftSample REJECTS partially_accepted, rejected,
    needs_human_review — three explicit tests
  - Every score has WHY (reasons non-empty)
  - Every playbook traces to source (source_run_ids non-empty)
  - Every preference has WHY (reason non-empty)
  - Receipts substantive (git_sha 40-char, sha256 64-char, validation_pass:bool)

Branch carries uncommitted auditor rebuild work (mode.rs + modes.toml
+ inference.ts + static.ts) blocked on upstream Ollama Cloud kimi-k2
500 ISE; held pending recon-driven design decisions.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
root 2026-04-26 22:30:38 -05:00
parent f753e11157
commit 27b1d27605
20 changed files with 2285 additions and 0 deletions

View File

@ -0,0 +1,116 @@
// EvidenceRecord schema tests.
//
// Two positive fixtures (one per real-source prototype: distilled_facts
// + contract_analyses) and three negative fixtures pinning the
// non-negotiable invariants the spec demands:
// - every record must trace to a source (provenance)
// - schema_version must match — silent v1/v2 drift is the worst kind
// - required identity fields (run_id) cannot be missing
//
// Run with: bun test auditor/schemas/distillation/evidence_record.test.ts
import { test, expect } from "bun:test";
import { readFileSync } from "node:fs";
import { resolve } from "node:path";
import { validateEvidenceRecord, EVIDENCE_SCHEMA_VERSION } from "./evidence_record";
const FIXTURE_DIR = resolve(import.meta.dir, "fixtures");
function loadFixture(name: string): unknown {
return JSON.parse(readFileSync(resolve(FIXTURE_DIR, name), "utf8"));
}
test("EVIDENCE_SCHEMA_VERSION is 1 — bump deliberately, never silently", () => {
expect(EVIDENCE_SCHEMA_VERSION).toBe(1);
});
test("positive: distilled_fact materialized record validates", () => {
const r = validateEvidenceRecord(loadFixture("evidence_positive_distilled_fact.json"));
if (!r.valid) console.error("unexpected errors:", r.errors);
expect(r.valid).toBe(true);
if (r.valid) {
expect(r.value.run_id).toBe("cae21289");
expect(r.value.model_role).toBe("extractor");
expect(r.value.provenance.source_file).toBe("data/_kb/distilled_facts.jsonl");
}
});
test("positive: contract_analysis materialized record validates with retrieval + observer fields", () => {
const r = validateEvidenceRecord(loadFixture("evidence_positive_contract_analysis.json"));
if (!r.valid) console.error("unexpected errors:", r.errors);
expect(r.valid).toBe(true);
if (r.valid) {
expect(r.value.observer_verdict).toBe("reject");
expect(r.value.observer_confidence).toBe(95);
expect(r.value.retrieved_context?.matrix_corpora?.length).toBe(4);
expect(r.value.failure_markers).toContain("observer_rejected");
}
});
test("negative: missing run_id is rejected with a specific error", () => {
const r = validateEvidenceRecord(loadFixture("evidence_negative_no_run_id.json"));
expect(r.valid).toBe(false);
if (!r.valid) {
expect(r.errors.some(e => e.includes("run_id"))).toBe(true);
}
});
test("negative: schema_version mismatch is rejected (silent v1/v2 drift guard)", () => {
const r = validateEvidenceRecord(loadFixture("evidence_negative_bad_schema_version.json"));
expect(r.valid).toBe(false);
if (!r.valid) {
expect(r.errors.some(e => e.includes("schema_version"))).toBe(true);
}
});
test("negative: bad provenance (non-sha256 sig_hash, non-ISO timestamp) is rejected", () => {
const r = validateEvidenceRecord(loadFixture("evidence_negative_bad_provenance.json"));
expect(r.valid).toBe(false);
if (!r.valid) {
// Must catch BOTH the sig_hash AND the recorded_at — comprehensive
// error reporting is part of the contract.
expect(r.errors.some(e => e.includes("sig_hash"))).toBe(true);
expect(r.errors.some(e => e.includes("recorded_at"))).toBe(true);
}
});
test("negative: non-object input is rejected with clear error", () => {
const r = validateEvidenceRecord("not an object");
expect(r.valid).toBe(false);
if (!r.valid) {
expect(r.errors[0]).toContain("expected object");
}
});
test("negative: human_override with invalid decision is rejected", () => {
const fixture = loadFixture("evidence_positive_distilled_fact.json") as Record<string, unknown>;
fixture.human_override = {
overrider: "test-user",
decision: "maybe", // invalid — must be accept|reject|needs_review
reason: "test",
overridden_at: "2026-04-26T22:30:00.000Z",
};
const r = validateEvidenceRecord(fixture);
expect(r.valid).toBe(false);
if (!r.valid) {
expect(r.errors.some(e => e.includes("human_override.decision"))).toBe(true);
}
});
test("positive: human_override = null is allowed (explicitly no override)", () => {
const fixture = loadFixture("evidence_positive_distilled_fact.json") as Record<string, unknown>;
fixture.human_override = null;
const r = validateEvidenceRecord(fixture);
expect(r.valid).toBe(true);
});
test("negative: observer_confidence outside [0, 100] is rejected", () => {
const fixture = loadFixture("evidence_positive_contract_analysis.json") as Record<string, unknown>;
fixture.observer_confidence = 150;
const r = validateEvidenceRecord(fixture);
expect(r.valid).toBe(false);
if (!r.valid) {
expect(r.errors.some(e => e.includes("observer_confidence"))).toBe(true);
}
});

View File

@ -0,0 +1,194 @@
// EvidenceRecord — the unified per-execution-trace record that the
// Evidence View emits and the Success Scorer reads.
//
// Derived from now.md spec + reconciliation of two existing prototypes:
// - distilled_facts.jsonl / distilled_procedures.jsonl (LLM-extracted
// text with run_id + sig_hash + extractor + verifier + embedding)
// - contract_analyses.jsonl (observer integration + retrieval
// telemetry + cost + duration)
//
// Required fields are the ones every record MUST have for traceability:
// run_id, task_id, timestamp, schema_version, provenance. Everything
// else is typed-but-optional because no single source has all of them
// — the Evidence View materializes them by JOINing across streams when
// the source data is present.
//
// schema_version starts at 1 and gets bumped on breaking changes.
// Validators MUST check schema_version and refuse unknown values so a
// future v2 reader doesn't silently accept v1 records (or vice versa).
import {
ValidationResult, Provenance,
requireString, requireNumber, requireIsoTimestamp, requireProvenance, requireStringArray,
} from "./types";
export const EVIDENCE_SCHEMA_VERSION = 1;
export type ModelRole =
| "executor" // produced the answer (e.g. scrum reviewer, mode runner LLM call)
| "reviewer" // judged an executor output (e.g. observer, hand-review)
| "extractor" // pulled structured data from text (e.g. fact_extractor)
| "verifier" // confirmed/rejected an extracted claim (verifier in distilled_*)
| "categorizer" // assigned a category (categorizer in distilled_*)
| "tiebreaker" // resolved a consensus split
| "applier" // landed code (scrum_applier)
| "embedder" // produced embeddings
| "other";
export interface EvidenceRecord {
// ── Identity ──
// run_id ties this record to a specific execution. Sources use it
// inconsistently (some stream-level, some per-call). The Evidence
// View canonicalizes to per-call; if the source is stream-level,
// synthesize as `${stream_run_id}:${row_index}`.
run_id: string;
// task_id groups records by logical task (e.g. one PR = one task_id
// across multiple per-call runs). Defaults to run_id when no group
// exists — never null.
task_id: string;
// ISO 8601 of when the EXECUTION happened, not when this record was
// materialized. Use the source row's timestamp; provenance carries
// the materialization time separately.
timestamp: string;
schema_version: number;
// ── Provenance ── (required — no record without source linkage)
provenance: Provenance;
// ── Model attribution (optional) ──
model_name?: string; // e.g. "kimi-k2:1t", "gpt-oss:120b"
model_provider?: string; // e.g. "ollama_cloud", "openrouter", "ollama"
model_role?: ModelRole;
// ── Content hashes (optional) ──
// sha256 of the full input prompt and full output content. Pre-
// computed so the Evidence Index can dedup across re-runs of the
// same prompt without re-hashing.
input_hash?: string;
output_hash?: string;
// ── Repo + execution context ──
source_files?: string[]; // files the run touched/read
commands_run?: string[]; // shell commands or tool calls fired
retrieved_context?: { // what the model saw via retrieval
matrix_corpora?: string[];
matrix_hits?: number;
matrix_chunks_kept?: number;
matrix_chunks_dropped?: number;
pathway_fingerprints_seen?: number;
};
// ── Observer + scratchpad ──
observer_notes?: string[]; // observer.review() free-form notes
observer_verdict?: "accept" | "reject" | "cycle" | string;
observer_confidence?: number; // 0-100
scratchpad_summary?: string; // tree-split scratchpad text or hash ref
// ── Outcome markers ──
// Both arrays exist because a run can have multiple succeeded gates
// AND multiple failed gates simultaneously. Empty arrays are valid;
// missing arrays are also valid (means "no evidence either way").
success_markers?: string[]; // e.g. "cargo_green", "tests_passed", "anchor_grounded"
failure_markers?: string[]; // e.g. "warning_count_up", "rationale_mismatch", "consensus_split"
// ── Validation telemetry ──
validation_results?: {
grounded_fraction?: number; // mode_compare grounding %
schema_valid?: boolean;
pathway_replay_succeeded?: boolean;
[key: string]: unknown;
};
// ── Human-in-loop ──
human_override?: {
overrider: string; // user identifier
decision: "accept" | "reject" | "needs_review";
reason: string;
overridden_at: string; // ISO 8601
} | null;
// ── Performance ──
cost_usd?: number;
latency_ms?: number;
prompt_tokens?: number;
completion_tokens?: number;
// ── Free-form text content (the actual run output) ──
// Optional because some sources are pure metadata (auto_apply.jsonl)
// and have no text payload. Present for distilled_*, contract_analyses,
// mode_experiments, scrum_reviews etc.
text?: string;
}
export function validateEvidenceRecord(input: unknown): ValidationResult<EvidenceRecord> {
const errors: string[] = [];
if (typeof input !== "object" || input === null) {
return { valid: false, errors: ["expected object, got " + (input === null ? "null" : typeof input)] };
}
const r = input as Record<string, unknown>;
// Required
let ok = true;
ok = requireString(r.run_id, "run_id", errors) && ok;
ok = requireString(r.task_id, "task_id", errors) && ok;
ok = requireIsoTimestamp(r.timestamp, "timestamp", errors) && ok;
ok = requireProvenance(r.provenance, "provenance", errors) && ok;
if (r.schema_version !== EVIDENCE_SCHEMA_VERSION) {
errors.push(`schema_version: expected ${EVIDENCE_SCHEMA_VERSION}, got ${JSON.stringify(r.schema_version)}`);
ok = false;
}
// Optional but typed-when-present
if (r.model_role !== undefined) {
const valid: ModelRole[] = ["executor", "reviewer", "extractor", "verifier", "categorizer", "tiebreaker", "applier", "embedder", "other"];
if (!valid.includes(r.model_role as ModelRole)) {
errors.push(`model_role: must be one of ${valid.join("|")}, got ${JSON.stringify(r.model_role)}`);
ok = false;
}
}
if (r.input_hash !== undefined && !/^[0-9a-f]{64}$/.test(String(r.input_hash))) {
errors.push("input_hash: must be hex sha256 when present");
ok = false;
}
if (r.output_hash !== undefined && !/^[0-9a-f]{64}$/.test(String(r.output_hash))) {
errors.push("output_hash: must be hex sha256 when present");
ok = false;
}
if (r.source_files !== undefined && !requireStringArray(r.source_files, "source_files", errors)) ok = false;
if (r.commands_run !== undefined && !requireStringArray(r.commands_run, "commands_run", errors)) ok = false;
if (r.success_markers !== undefined && !requireStringArray(r.success_markers, "success_markers", errors)) ok = false;
if (r.failure_markers !== undefined && !requireStringArray(r.failure_markers, "failure_markers", errors)) ok = false;
if (r.observer_notes !== undefined && !requireStringArray(r.observer_notes, "observer_notes", errors)) ok = false;
if (r.observer_confidence !== undefined) {
if (!requireNumber(r.observer_confidence, "observer_confidence", errors)) ok = false;
else if ((r.observer_confidence as number) < 0 || (r.observer_confidence as number) > 100) {
errors.push("observer_confidence: must be in [0, 100]");
ok = false;
}
}
if (r.human_override !== undefined && r.human_override !== null) {
const ho = r.human_override as Record<string, unknown>;
if (typeof ho !== "object") {
errors.push("human_override: expected object or null");
ok = false;
} else {
ok = requireString(ho.overrider, "human_override.overrider", errors) && ok;
ok = requireString(ho.reason, "human_override.reason", errors) && ok;
ok = requireIsoTimestamp(ho.overridden_at, "human_override.overridden_at", errors) && ok;
if (!["accept", "reject", "needs_review"].includes(ho.decision as string)) {
errors.push(`human_override.decision: must be accept|reject|needs_review`);
ok = false;
}
}
}
if (!ok) return { valid: false, errors };
return { valid: true, value: r as unknown as EvidenceRecord };
}

View File

@ -0,0 +1,11 @@
{
"run_id": "cae21289",
"task_id": "team_runs:637",
"timestamp": "2026-04-23T09:54:40.729599Z",
"schema_version": 1,
"provenance": {
"source_file": "data/_kb/distilled_facts.jsonl",
"sig_hash": "not-a-real-sha256",
"recorded_at": "yesterday"
}
}

View File

@ -0,0 +1,11 @@
{
"run_id": "cae21289",
"task_id": "team_runs:637",
"timestamp": "2026-04-23T09:54:40.729599Z",
"schema_version": 99,
"provenance": {
"source_file": "data/_kb/distilled_facts.jsonl",
"sig_hash": "21a809e2dc43dfae0000000000000000000000000000000000000000deadbeef",
"recorded_at": "2026-04-26T22:30:00.000Z"
}
}

View File

@ -0,0 +1,11 @@
{
"task_id": "team_runs:637",
"timestamp": "2026-04-23T09:54:40.729599Z",
"schema_version": 1,
"provenance": {
"source_file": "data/_kb/distilled_facts.jsonl",
"sig_hash": "21a809e2dc43dfae0000000000000000000000000000000000000000deadbeef",
"recorded_at": "2026-04-26T22:30:00.000Z"
},
"text": "missing run_id should fail validation"
}

View File

@ -0,0 +1,27 @@
{
"run_id": "contract_analysis:101078392:1777250758717",
"task_id": "permit:101078392",
"timestamp": "2026-04-25T23:45:58.717Z",
"schema_version": 1,
"provenance": {
"source_file": "data/_kb/contract_analyses.jsonl",
"line_offset": 0,
"sig_hash": "f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1f1",
"recorded_at": "2026-04-26T22:30:00.000Z"
},
"model_name": "kimi-k2:1t",
"model_role": "executor",
"model_provider": "ollama_cloud",
"retrieved_context": {
"matrix_corpora": ["entity_brief_v1", "chicago_permits_v1", "distilled_procedural_v20260423102847", "sec_tickers_v1"],
"matrix_hits": 10
},
"observer_notes": ["contractor history shows 0 prior fills in Chicago downtown zone"],
"observer_verdict": "reject",
"observer_confidence": 95,
"success_markers": ["matrix_hits_above_threshold"],
"failure_markers": ["observer_rejected"],
"cost_usd": 0.0002,
"latency_ms": 25419,
"text": "Permit 101078392 contractor ANTHONY FIORE — analysis: insufficient prior performance signal; recommend escalation."
}

View File

@ -0,0 +1,19 @@
{
"run_id": "cae21289",
"task_id": "team_runs:637",
"timestamp": "2026-04-23T09:54:40.729599Z",
"schema_version": 1,
"provenance": {
"source_file": "data/_kb/distilled_facts.jsonl",
"line_offset": 0,
"sig_hash": "21a809e2dc43dfae0000000000000000000000000000000000000000deadbeef",
"recorded_at": "2026-04-26T22:30:00.000Z"
},
"model_name": "qwen2.5:latest",
"model_role": "extractor",
"model_provider": "ollama",
"text": "Convergence refers to the system stabilizing into a state of high performance with low variance across iterations.",
"validation_results": {
"schema_valid": true
}
}

View File

@ -0,0 +1,56 @@
// ModelLedgerEntry — aggregate per-task-type-per-model performance.
// Built by aggregating mode_experiments.jsonl + model_trust.jsonl.
// Updated rather than appended — one row per (model_name, task_type)
// representing latest aggregates.
import {
ValidationResult, requireString, requireNumber, requireIsoTimestamp, requireStringArray,
} from "./types";
export const MODEL_LEDGER_SCHEMA_VERSION = 1;
export interface ModelLedgerEntry {
schema_version: number;
model_name: string;
model_provider: string;
task_type: string;
success_rate: number; // [0, 1]
failure_modes: string[]; // top failure mode tags
best_partner_model?: string; // pairs well with X (consensus / tie-break)
escalation_role?: string; // when this model gets escalated TO (or FROM)
cost_usd_p50?: number;
latency_ms_p50?: number;
latency_ms_p95?: number;
context_window?: number;
sample_count: number;
last_updated: string; // ISO 8601
notes?: string;
}
export function validateModelLedgerEntry(input: unknown): ValidationResult<ModelLedgerEntry> {
const errors: string[] = [];
if (typeof input !== "object" || input === null) return { valid: false, errors: ["expected object"] };
const r = input as Record<string, unknown>;
let ok = true;
if (r.schema_version !== MODEL_LEDGER_SCHEMA_VERSION) {
errors.push(`schema_version: expected ${MODEL_LEDGER_SCHEMA_VERSION}, got ${JSON.stringify(r.schema_version)}`);
ok = false;
}
ok = requireString(r.model_name, "model_name", errors) && ok;
ok = requireString(r.model_provider, "model_provider", errors) && ok;
ok = requireString(r.task_type, "task_type", errors) && ok;
ok = requireIsoTimestamp(r.last_updated, "last_updated", errors) && ok;
ok = requireStringArray(r.failure_modes, "failure_modes", errors) && ok;
if (!requireNumber(r.success_rate, "success_rate", errors)) ok = false;
else if ((r.success_rate as number) < 0 || (r.success_rate as number) > 1) {
errors.push("success_rate: must be in [0, 1]"); ok = false;
}
if (!requireNumber(r.sample_count, "sample_count", errors)) ok = false;
else if ((r.sample_count as number) < 1 || !Number.isInteger(r.sample_count)) {
errors.push("sample_count: must be positive integer (no aggregate from zero samples)"); ok = false;
}
if (!ok) return { valid: false, errors };
return { valid: true, value: r as unknown as ModelLedgerEntry };
}

View File

@ -0,0 +1,68 @@
// Playbook — procedural knowledge extracted from accepted/partially-
// accepted runs. Different from pathway_memory's bug_fingerprints (which
// are pattern-detectors) — playbooks describe HOW to handle a task type.
import {
ValidationResult, requireString, requireIsoTimestamp, requireProvenance, requireStringArray,
} from "./types";
export const PLAYBOOK_SCHEMA_VERSION = 1;
export interface Playbook {
schema_version: number;
playbook_id: string;
task_type: string; // e.g. "scrum_review", "pr_audit", "staffing.fill"
problem_pattern: string; // when does this playbook apply?
useful_context: string[]; // what to retrieve before running
model_routing_path: string[]; // ordered model attempts that worked
commands_worked: string[];
commands_failed: string[];
validation_steps: string[];
repo_files_touched: string[];
recovery_strategy: string; // what to do when the path fails
known_failure_modes: string[];
escalation_threshold: string; // when to switch to a stronger model
acceptance_criteria: string[]; // how to know it succeeded
source_run_ids: string[]; // FK to EvidenceRecord.run_id (provenance — every playbook traces to source)
created_at: string;
provenance: { source_file: string; line_offset?: number; sig_hash: string; recorded_at: string };
}
export function validatePlaybook(input: unknown): ValidationResult<Playbook> {
const errors: string[] = [];
if (typeof input !== "object" || input === null) return { valid: false, errors: ["expected object"] };
const r = input as Record<string, unknown>;
let ok = true;
if (r.schema_version !== PLAYBOOK_SCHEMA_VERSION) {
errors.push(`schema_version: expected ${PLAYBOOK_SCHEMA_VERSION}, got ${JSON.stringify(r.schema_version)}`);
ok = false;
}
ok = requireString(r.playbook_id, "playbook_id", errors) && ok;
ok = requireString(r.task_type, "task_type", errors) && ok;
ok = requireString(r.problem_pattern, "problem_pattern", errors) && ok;
ok = requireString(r.recovery_strategy, "recovery_strategy", errors) && ok;
ok = requireString(r.escalation_threshold, "escalation_threshold", errors) && ok;
ok = requireIsoTimestamp(r.created_at, "created_at", errors) && ok;
ok = requireStringArray(r.useful_context, "useful_context", errors) && ok;
ok = requireStringArray(r.model_routing_path, "model_routing_path", errors) && ok;
ok = requireStringArray(r.commands_worked, "commands_worked", errors) && ok;
ok = requireStringArray(r.commands_failed, "commands_failed", errors) && ok;
ok = requireStringArray(r.validation_steps, "validation_steps", errors) && ok;
ok = requireStringArray(r.repo_files_touched, "repo_files_touched", errors) && ok;
ok = requireStringArray(r.known_failure_modes, "known_failure_modes", errors) && ok;
ok = requireStringArray(r.acceptance_criteria, "acceptance_criteria", errors) && ok;
ok = requireStringArray(r.source_run_ids, "source_run_ids", errors) && ok;
if (Array.isArray(r.source_run_ids) && r.source_run_ids.length === 0) {
errors.push("source_run_ids: must be non-empty — every playbook traces to source evidence (spec non-negotiable)");
ok = false;
}
if (Array.isArray(r.acceptance_criteria) && r.acceptance_criteria.length === 0) {
errors.push("acceptance_criteria: must be non-empty — every playbook needs success criteria (spec non-negotiable)");
ok = false;
}
ok = requireProvenance(r.provenance, "provenance", errors) && ok;
if (!ok) return { valid: false, errors };
return { valid: true, value: r as unknown as Playbook };
}

View File

@ -0,0 +1,66 @@
// PreferenceSample — entry in exports/preference/chosen_rejected.jsonl.
// Source: real disagreements (audit_discrepancies, scrum ladder retries).
// Validator pins: chosen != rejected, both source_run_ids present, reason
// is non-empty. No synthesized preferences.
import {
ValidationResult, requireString, requireIsoTimestamp, requireProvenance,
} from "./types";
export const PREFERENCE_SAMPLE_SCHEMA_VERSION = 1;
export interface PreferenceSample {
schema_version: number;
prompt: string;
chosen: string;
rejected: string;
reason: string; // why chosen > rejected — must be non-empty
source_run_ids: {
chosen: string;
rejected: string;
};
exported_at: string;
provenance: { source_file: string; line_offset?: number; sig_hash: string; recorded_at: string };
}
export function validatePreferenceSample(input: unknown): ValidationResult<PreferenceSample> {
const errors: string[] = [];
if (typeof input !== "object" || input === null) return { valid: false, errors: ["expected object"] };
const r = input as Record<string, unknown>;
let ok = true;
if (r.schema_version !== PREFERENCE_SAMPLE_SCHEMA_VERSION) {
errors.push(`schema_version: expected ${PREFERENCE_SAMPLE_SCHEMA_VERSION}, got ${JSON.stringify(r.schema_version)}`);
ok = false;
}
ok = requireString(r.prompt, "prompt", errors) && ok;
ok = requireString(r.chosen, "chosen", errors) && ok;
ok = requireString(r.rejected, "rejected", errors) && ok;
ok = requireString(r.reason, "reason", errors) && ok;
ok = requireIsoTimestamp(r.exported_at, "exported_at", errors) && ok;
ok = requireProvenance(r.provenance, "provenance", errors) && ok;
// Self-pairing guard.
if (r.chosen === r.rejected && typeof r.chosen === "string") {
errors.push("chosen and rejected must differ — preference data needs a real disagreement");
ok = false;
}
if (typeof r.reason === "string" && (r.reason as string).trim().length === 0) {
errors.push("reason: must be non-whitespace (every preference needs WHY chosen > rejected)");
ok = false;
}
if (typeof r.source_run_ids !== "object" || r.source_run_ids === null) {
errors.push("source_run_ids: expected object {chosen, rejected}");
ok = false;
} else {
const s = r.source_run_ids as Record<string, unknown>;
ok = requireString(s.chosen, "source_run_ids.chosen", errors) && ok;
ok = requireString(s.rejected, "source_run_ids.rejected", errors) && ok;
if (s.chosen === s.rejected && typeof s.chosen === "string") {
errors.push("source_run_ids.chosen and .rejected must differ — same run can't disagree with itself");
ok = false;
}
}
if (!ok) return { valid: false, errors };
return { valid: true, value: r as unknown as PreferenceSample };
}

View File

@ -0,0 +1,52 @@
// RagSample — entry in exports/rag/playbooks.jsonl. Spec shape exactly,
// plus provenance + success_score (so the index can re-rank by quality).
import {
ValidationResult, requireString, requireNumber, requireIsoTimestamp, requireProvenance, requireStringArray,
} from "./types";
export const RAG_SAMPLE_SCHEMA_VERSION = 1;
export interface RagSample {
schema_version: number;
id: string;
title: string;
content: string;
tags: string[];
source_run_id: string;
success_score: "accepted" | "partially_accepted"; // RAG only ships from these two
embedding_text: string; // the text to embed (often == content but can be shorter)
exported_at: string;
provenance: { source_file: string; line_offset?: number; sig_hash: string; recorded_at: string };
}
export function validateRagSample(input: unknown): ValidationResult<RagSample> {
const errors: string[] = [];
if (typeof input !== "object" || input === null) return { valid: false, errors: ["expected object"] };
const r = input as Record<string, unknown>;
let ok = true;
if (r.schema_version !== RAG_SAMPLE_SCHEMA_VERSION) {
errors.push(`schema_version: expected ${RAG_SAMPLE_SCHEMA_VERSION}, got ${JSON.stringify(r.schema_version)}`);
ok = false;
}
ok = requireString(r.id, "id", errors) && ok;
ok = requireString(r.title, "title", errors) && ok;
ok = requireString(r.content, "content", errors) && ok;
ok = requireString(r.embedding_text, "embedding_text", errors) && ok;
ok = requireString(r.source_run_id, "source_run_id", errors) && ok;
ok = requireIsoTimestamp(r.exported_at, "exported_at", errors) && ok;
ok = requireStringArray(r.tags, "tags", errors) && ok;
ok = requireProvenance(r.provenance, "provenance", errors) && ok;
if (!["accepted", "partially_accepted"].includes(r.success_score as string)) {
errors.push("success_score: must be accepted|partially_accepted (rejected/needs_human never enter RAG)");
ok = false;
}
if (typeof r.content === "string" && (r.content as string).trim().length === 0) {
errors.push("content: must be non-whitespace");
ok = false;
}
if (!ok) return { valid: false, errors };
return { valid: true, value: r as unknown as RagSample };
}

View File

@ -0,0 +1,286 @@
// Real-data validation test — proves the EvidenceRecord schema fits
// what we ALREADY produce, with the minimum transformation each source
// stream requires. Doubles as the stale-extraction probe: if
// distilled_facts.jsonl rows can't materialize, we know that stream
// has rotted and Phase 2 sources from elsewhere.
//
// Strategy:
// 1. Read first N rows from each source jsonl (skip if missing)
// 2. Apply minimal transformer: add schema_version + provenance,
// synthesize run_id/task_id when source doesn't carry them
// 3. Validate each materialized record
// 4. Tally pass/fail per source + collect failure reasons
//
// This file is allowed to skip when source files don't exist (fresh
// clone), so it acts as both a CI guard and a real-environment probe.
import { test, expect } from "bun:test";
import { existsSync, readFileSync } from "node:fs";
import { resolve } from "node:path";
import {
validateEvidenceRecord, EVIDENCE_SCHEMA_VERSION, EvidenceRecord, ModelRole,
} from "./evidence_record";
const ROOT = "/home/profit/lakehouse";
const SAMPLE_PER_SOURCE = 10;
interface SourceProbe {
source_file: string;
transform: (row: any, lineNo: number) => Partial<EvidenceRecord> | null;
}
// Canonical 64-char synthetic sha256 for tests where the source row
// lacks one. Pretends the materializer would compute it via
// canonicalSha256(orderedKeys(row)) at Phase 2 time. We use a fixed
// value here to keep the test deterministic; real materialization
// re-hashes per row.
const PLACEHOLDER_SHA = "0000000000000000000000000000000000000000000000000000000000000000";
const RECORDED = "2026-04-26T22:30:00.000Z";
function provFor(source_file: string, lineNo: number, sigHashRaw?: string): EvidenceRecord["provenance"] {
// Pad shorter hashes (distilled_* uses 16-char) to 64 — mimics
// canonical recompute.
const sig = sigHashRaw && /^[0-9a-f]+$/.test(sigHashRaw)
? sigHashRaw.padEnd(64, "0").slice(0, 64)
: PLACEHOLDER_SHA;
return {
source_file: source_file.replace(`${ROOT}/`, ""),
line_offset: lineNo,
sig_hash: sig,
recorded_at: RECORDED,
};
}
const PROBES: SourceProbe[] = [
{
source_file: `${ROOT}/data/_kb/distilled_facts.jsonl`,
transform: (row: any, lineNo: number) => ({
run_id: String(row.run_id ?? `distilled_facts:${lineNo}`),
task_id: String(row.source_label ?? `distilled_facts:${lineNo}`),
timestamp: row.created_at,
schema_version: EVIDENCE_SCHEMA_VERSION,
provenance: provFor(`${ROOT}/data/_kb/distilled_facts.jsonl`, lineNo, row.sig_hash),
model_name: row.extractor,
model_role: "extractor" as ModelRole,
model_provider: "ollama",
text: row.text,
}),
},
{
source_file: `${ROOT}/data/_kb/distilled_procedures.jsonl`,
transform: (row: any, lineNo: number) => ({
run_id: String(row.run_id ?? `distilled_procedures:${lineNo}`),
task_id: String(row.source_label ?? `distilled_procedures:${lineNo}`),
timestamp: row.created_at,
schema_version: EVIDENCE_SCHEMA_VERSION,
provenance: provFor(`${ROOT}/data/_kb/distilled_procedures.jsonl`, lineNo, row.sig_hash),
model_name: row.extractor,
model_role: "extractor" as ModelRole,
model_provider: "ollama",
text: row.text,
}),
},
{
source_file: `${ROOT}/data/_kb/contract_analyses.jsonl`,
transform: (row: any, lineNo: number) => ({
run_id: `contract_analysis:${row.permit_id}:${new Date(row.ts).getTime()}`,
task_id: `permit:${row.permit_id}`,
timestamp: row.ts,
schema_version: EVIDENCE_SCHEMA_VERSION,
provenance: provFor(`${ROOT}/data/_kb/contract_analyses.jsonl`, lineNo),
model_role: "executor" as ModelRole,
retrieved_context: {
matrix_corpora: Object.keys(row.matrix_corpora ?? {}),
matrix_hits: row.matrix_hits,
},
observer_notes: row.observer_notes ? [row.observer_notes].flat() : undefined,
observer_verdict: row.observer_verdict,
observer_confidence: row.observer_conf,
success_markers: row.ok ? ["matrix_hits_above_threshold"] : undefined,
failure_markers: !row.ok || row.observer_verdict === "reject" ? ["observer_rejected"] : undefined,
cost_usd: typeof row.cost === "number" ? row.cost / 1_000_000 : undefined,
latency_ms: row.duration_ms,
text: row.analysis,
}),
},
{
source_file: `${ROOT}/data/_kb/mode_experiments.jsonl`,
transform: (row: any, lineNo: number) => ({
run_id: `mode_exec:${new Date(row.ts).getTime()}:${row.file_path ?? "?"}`,
task_id: row.task_class,
timestamp: row.ts,
schema_version: EVIDENCE_SCHEMA_VERSION,
provenance: provFor(`${ROOT}/data/_kb/mode_experiments.jsonl`, lineNo),
model_name: row.model,
model_role: "executor" as ModelRole,
model_provider: row.model?.includes("/") ? "openrouter" : "ollama_cloud",
retrieved_context: {
matrix_corpora: row.sources?.matrix_corpus,
matrix_chunks_kept: row.sources?.matrix_chunks_kept,
matrix_chunks_dropped: row.sources?.matrix_chunks_dropped,
pathway_fingerprints_seen: row.sources?.bug_fingerprints_count,
},
latency_ms: row.latency_ms,
text: row.response,
source_files: row.file_path ? [row.file_path] : undefined,
}),
},
{
source_file: `${ROOT}/data/_kb/scrum_reviews.jsonl`,
transform: (row: any, lineNo: number) => ({
run_id: `scrum:${new Date(row.reviewed_at).getTime()}:${row.file}`,
task_id: `scrum_review:${row.file}`,
timestamp: row.reviewed_at,
schema_version: EVIDENCE_SCHEMA_VERSION,
provenance: provFor(`${ROOT}/data/_kb/scrum_reviews.jsonl`, lineNo),
model_name: row.accepted_model,
model_role: "executor" as ModelRole,
source_files: [row.file],
success_markers: row.accepted_on_attempt ? [`accepted_on_attempt_${row.accepted_on_attempt}`] : undefined,
text: row.suggestions_preview,
}),
},
{
source_file: `${ROOT}/data/_kb/observer_escalations.jsonl`,
transform: (row: any, lineNo: number) => ({
run_id: `obs_esc:${new Date(row.ts).getTime()}:${row.sig_hash}`,
task_id: `observer_escalation:${row.cluster_endpoint ?? "?"}`,
timestamp: row.ts,
schema_version: EVIDENCE_SCHEMA_VERSION,
provenance: provFor(`${ROOT}/data/_kb/observer_escalations.jsonl`, lineNo, row.sig_hash),
model_role: "reviewer" as ModelRole,
prompt_tokens: row.prompt_tokens,
completion_tokens: row.completion_tokens,
text: row.analysis,
}),
},
{
source_file: `${ROOT}/data/_kb/audit_facts.jsonl`,
transform: (row: any, lineNo: number) => ({
run_id: `audit_facts:${row.head_sha}:${lineNo}`,
task_id: `pr:${row.pr_number}`,
timestamp: row.extracted_at,
schema_version: EVIDENCE_SCHEMA_VERSION,
provenance: provFor(`${ROOT}/data/_kb/audit_facts.jsonl`, lineNo),
model_name: row.extractor,
model_role: "extractor" as ModelRole,
// facts/entities/relationships go into text as a JSON dump for now;
// structured handling lives in Phase 2 where we map to specific
// EvidenceRecord substructures.
text: JSON.stringify({
facts: row.facts?.length ?? 0,
entities: row.entities?.length ?? 0,
relationships: row.relationships?.length ?? 0,
}),
}),
},
];
interface ProbeResult {
source_file: string;
rows_attempted: number;
rows_present: boolean;
passed: number;
failed: number;
failure_reasons: string[]; // unique error strings, top 5
}
const RESULTS: ProbeResult[] = [];
for (const probe of PROBES) {
const sourceLabel = probe.source_file.replace(`${ROOT}/`, "");
test(`real-data: ${sourceLabel}`, () => {
const result: ProbeResult = {
source_file: sourceLabel,
rows_attempted: 0,
rows_present: false,
passed: 0,
failed: 0,
failure_reasons: [],
};
if (!existsSync(probe.source_file)) {
RESULTS.push(result);
// Skip silently — fresh clones won't have these files
return;
}
result.rows_present = true;
const lines = readFileSync(probe.source_file, "utf8").split("\n").filter(Boolean).slice(0, SAMPLE_PER_SOURCE);
const reasons = new Set<string>();
for (let i = 0; i < lines.length; i++) {
result.rows_attempted++;
let row: unknown;
try { row = JSON.parse(lines[i]); }
catch { continue; }
const transformed = probe.transform(row, i);
if (!transformed) continue;
const v = validateEvidenceRecord(transformed);
if (v.valid) result.passed++;
else {
result.failed++;
for (const e of v.errors) reasons.add(e);
}
}
result.failure_reasons = Array.from(reasons).slice(0, 5);
RESULTS.push(result);
// Test passes as long as we attempted something and got a result.
// Per-source pass/fail counts are reported in the markdown writeup.
expect(result.rows_attempted).toBeGreaterThanOrEqual(0);
});
}
test("real-data: emit markdown report", () => {
const md: string[] = [];
md.push("# Real-data validation report");
md.push("");
md.push("Schema = EvidenceRecord v" + EVIDENCE_SCHEMA_VERSION + ". Sample = first " + SAMPLE_PER_SOURCE + " rows per source.");
md.push("");
md.push("| Source | Present | Rows | Pass | Fail | Pass% |");
md.push("|---|---|---|---|---|---|");
for (const r of RESULTS) {
const pct = r.rows_attempted > 0 ? Math.round(100 * r.passed / r.rows_attempted) + "%" : "—";
md.push(`| ${r.source_file} | ${r.rows_present ? "✓" : "—"} | ${r.rows_attempted} | ${r.passed} | ${r.failed} | ${pct} |`);
}
md.push("");
let hasFailures = false;
for (const r of RESULTS) {
if (r.failed > 0) {
hasFailures = true;
md.push(`## Failures in ${r.source_file}`);
for (const reason of r.failure_reasons) md.push(`- \`${reason}\``);
md.push("");
}
}
if (!hasFailures) {
md.push("**No failures across all probed sources.** Every materialized record validates against EvidenceRecord v1.");
md.push("");
}
// Stale extraction probe: explicit pass/fail
const distilledFacts = RESULTS.find(r => r.source_file.endsWith("distilled_facts.jsonl"));
const distilledProc = RESULTS.find(r => r.source_file.endsWith("distilled_procedures.jsonl"));
md.push("## Stale-extraction probe");
md.push("");
if (distilledFacts && distilledFacts.rows_present && distilledFacts.passed > 0) {
md.push(`- **distilled_facts.jsonl:** ${distilledFacts.passed}/${distilledFacts.rows_attempted} materialize cleanly. Stream is alive at the schema level.`);
} else if (distilledFacts && !distilledFacts.rows_present) {
md.push(`- **distilled_facts.jsonl:** missing — stale or never produced. Phase 2 sources from live streams instead.`);
} else {
md.push(`- **distilled_facts.jsonl:** present but materialization failures; treat as suspect, prefer mode_experiments + scrum_reviews.`);
}
if (distilledProc && distilledProc.rows_present && distilledProc.passed > 0) {
md.push(`- **distilled_procedures.jsonl:** ${distilledProc.passed}/${distilledProc.rows_attempted} materialize cleanly.`);
}
md.push("");
// Write the markdown to a stable path and stdout
const out = md.join("\n");
Bun.write(`${ROOT}/data/_kb/realdata_validation_report.md`, out);
console.log("\n" + out);
});

View File

@ -0,0 +1,111 @@
// Receipt — per-pipeline-stage record with everything needed to
// reproduce the run. Spec non-negotiable: substantive receipts, not
// "ran successfully". Every field below has a deterministic source so
// the receipt schema validator catches "I forgot to fill it in" the
// same way it catches type errors.
import {
ValidationResult, requireString, requireNumber, requireIsoTimestamp,
} from "./types";
export const RECEIPT_SCHEMA_VERSION = 1;
export interface FileReference {
path: string; // relative to repo root
sha256: string; // hex
bytes?: number; // optional but recommended
}
export interface Receipt {
schema_version: number;
command: string; // shell-line or script identifier
git_sha: string; // 40-char hex (full SHA1)
git_branch?: string;
git_dirty?: boolean; // true if working tree had uncommitted changes
started_at: string; // ISO 8601
ended_at: string; // ISO 8601
duration_ms: number;
input_files: FileReference[];
output_files: FileReference[];
record_counts: {
in: number;
out: number;
[key: string]: number; // per-stage extras (filtered, dropped, etc.)
};
validation_pass: boolean; // explicit — never inferred
errors: string[];
warnings: string[];
}
function validateFileRef(v: unknown, field: string, errors: string[]): boolean {
if (typeof v !== "object" || v === null) {
errors.push(`${field}: expected object`);
return false;
}
const f = v as Record<string, unknown>;
let ok = true;
ok = requireString(f.path, `${field}.path`, errors) && ok;
if (typeof f.sha256 !== "string" || !/^[0-9a-f]{64}$/.test(f.sha256)) {
errors.push(`${field}.sha256: must be hex sha256`);
ok = false;
}
if (f.bytes !== undefined && typeof f.bytes !== "number") {
errors.push(`${field}.bytes: expected number when present`);
ok = false;
}
return ok;
}
export function validateReceipt(input: unknown): ValidationResult<Receipt> {
const errors: string[] = [];
if (typeof input !== "object" || input === null) {
return { valid: false, errors: ["expected object"] };
}
const r = input as Record<string, unknown>;
let ok = true;
if (r.schema_version !== RECEIPT_SCHEMA_VERSION) {
errors.push(`schema_version: expected ${RECEIPT_SCHEMA_VERSION}, got ${JSON.stringify(r.schema_version)}`);
ok = false;
}
ok = requireString(r.command, "command", errors) && ok;
if (typeof r.git_sha !== "string" || !/^[0-9a-f]{40}$/.test(r.git_sha as string)) {
errors.push("git_sha: must be 40-char hex");
ok = false;
}
ok = requireIsoTimestamp(r.started_at, "started_at", errors) && ok;
ok = requireIsoTimestamp(r.ended_at, "ended_at", errors) && ok;
ok = requireNumber(r.duration_ms, "duration_ms", errors) && ok;
if (typeof r.validation_pass !== "boolean") {
errors.push("validation_pass: must be boolean (explicit, never inferred)");
ok = false;
}
if (!Array.isArray(r.input_files)) {
errors.push("input_files: expected array");
ok = false;
} else {
for (let i = 0; i < r.input_files.length; i++) {
if (!validateFileRef(r.input_files[i], `input_files[${i}]`, errors)) ok = false;
}
}
if (!Array.isArray(r.output_files)) {
errors.push("output_files: expected array");
ok = false;
} else {
for (let i = 0; i < r.output_files.length; i++) {
if (!validateFileRef(r.output_files[i], `output_files[${i}]`, errors)) ok = false;
}
}
if (typeof r.record_counts !== "object" || r.record_counts === null) {
errors.push("record_counts: expected object");
ok = false;
} else {
const rc = r.record_counts as Record<string, unknown>;
if (typeof rc.in !== "number") { errors.push("record_counts.in: expected number"); ok = false; }
if (typeof rc.out !== "number") { errors.push("record_counts.out: expected number"); ok = false; }
}
if (!Array.isArray(r.errors)) { errors.push("errors: expected array"); ok = false; }
if (!Array.isArray(r.warnings)) { errors.push("warnings: expected array"); ok = false; }
if (!ok) return { valid: false, errors };
return { valid: true, value: r as unknown as Receipt };
}

View File

@ -0,0 +1,344 @@
// Combined schema tests for ScoredRun, Receipt, Playbook,
// ScratchpadSummary, ModelLedgerEntry, RagSample, SftSample,
// PreferenceSample. EvidenceRecord lives in its own file because it's
// the foundational schema and warrants the JSON-fixture round-trip
// pattern; the rest use inline fixture makers since they're simpler.
//
// Each schema: 1 positive fixture + 4-5 negative cases pinning the
// non-negotiable invariants from now.md.
//
// Run: bun test auditor/schemas/distillation/schemas.test.ts
import { test, expect } from "bun:test";
import { validateScoredRun, SCORED_RUN_SCHEMA_VERSION } from "./scored_run";
import { validateReceipt, RECEIPT_SCHEMA_VERSION } from "./receipt";
import { validatePlaybook, PLAYBOOK_SCHEMA_VERSION } from "./playbook";
import { validateScratchpadSummary, SCRATCHPAD_SCHEMA_VERSION } from "./scratchpad_summary";
import { validateModelLedgerEntry, MODEL_LEDGER_SCHEMA_VERSION } from "./model_ledger";
import { validateRagSample, RAG_SAMPLE_SCHEMA_VERSION } from "./rag_sample";
import { validateSftSample, SFT_SAMPLE_SCHEMA_VERSION } from "./sft_sample";
import { validatePreferenceSample, PREFERENCE_SAMPLE_SCHEMA_VERSION } from "./preference_sample";
const NOW = "2026-04-26T22:30:00.000Z";
const SHA = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef";
const GIT_SHA = "f753e11157eef753e11157eef753e11157eef753";
const PROVENANCE = {
source_file: "data/_kb/scored_runs.jsonl",
line_offset: 0,
sig_hash: SHA,
recorded_at: NOW,
};
// ─── ScoredRun ───────────────────────────────────────────────────────
const SCORED_RUN_OK = {
schema_version: SCORED_RUN_SCHEMA_VERSION,
evidence_run_id: "run-abc",
evidence_task_id: "task-abc",
category: "accepted",
reasons: ["cargo_green=true", "anchor_grounding=0.95"],
scored_at: NOW,
scorer_version: "v1.0.0",
sub_scores: { cargo_green: true, anchor_grounding: 0.95 },
provenance: PROVENANCE,
};
test("ScoredRun: positive validates", () => {
const r = validateScoredRun(SCORED_RUN_OK);
if (!r.valid) console.error(r.errors);
expect(r.valid).toBe(true);
});
test("ScoredRun: empty reasons rejected (every score needs a reason)", () => {
const r = validateScoredRun({ ...SCORED_RUN_OK, reasons: [] });
expect(r.valid).toBe(false);
if (!r.valid) expect(r.errors.some(e => e.includes("reasons"))).toBe(true);
});
test("ScoredRun: invalid category rejected", () => {
const r = validateScoredRun({ ...SCORED_RUN_OK, category: "maybe_ok" });
expect(r.valid).toBe(false);
if (!r.valid) expect(r.errors.some(e => e.includes("category"))).toBe(true);
});
test("ScoredRun: anchor_grounding > 1 rejected (must be in [0, 1])", () => {
const r = validateScoredRun({ ...SCORED_RUN_OK, sub_scores: { ...SCORED_RUN_OK.sub_scores, anchor_grounding: 1.5 } });
expect(r.valid).toBe(false);
if (!r.valid) expect(r.errors.some(e => e.includes("anchor_grounding"))).toBe(true);
});
// ─── Receipt ─────────────────────────────────────────────────────────
const RECEIPT_OK = {
schema_version: RECEIPT_SCHEMA_VERSION,
command: "bun run scripts/build_evidence_index.ts",
git_sha: GIT_SHA,
git_branch: "scrum/auto-apply-19814",
git_dirty: false,
started_at: NOW,
ended_at: NOW,
duration_ms: 1234,
input_files: [{ path: "data/_kb/scrum_reviews.jsonl", sha256: SHA, bytes: 448000 }],
output_files: [{ path: "data/evidence/2026/04/26/run.jsonl", sha256: SHA }],
record_counts: { in: 100, out: 95, filtered: 5 },
validation_pass: true,
errors: [],
warnings: [],
};
test("Receipt: positive validates", () => {
const r = validateReceipt(RECEIPT_OK);
if (!r.valid) console.error(r.errors);
expect(r.valid).toBe(true);
});
test("Receipt: bad git_sha rejected (must be 40-char hex)", () => {
const r = validateReceipt({ ...RECEIPT_OK, git_sha: "abc123" });
expect(r.valid).toBe(false);
if (!r.valid) expect(r.errors.some(e => e.includes("git_sha"))).toBe(true);
});
test("Receipt: validation_pass must be boolean (never inferred)", () => {
const r = validateReceipt({ ...RECEIPT_OK, validation_pass: "yes" });
expect(r.valid).toBe(false);
if (!r.valid) expect(r.errors.some(e => e.includes("validation_pass"))).toBe(true);
});
test("Receipt: file refs without proper sha256 rejected", () => {
const r = validateReceipt({ ...RECEIPT_OK, output_files: [{ path: "x", sha256: "short" }] });
expect(r.valid).toBe(false);
if (!r.valid) expect(r.errors.some(e => e.includes("sha256"))).toBe(true);
});
// ─── Playbook ────────────────────────────────────────────────────────
const PLAYBOOK_OK = {
schema_version: PLAYBOOK_SCHEMA_VERSION,
playbook_id: "pb-scrum-review-001",
task_type: "scrum_review",
problem_pattern: "Cargo workspace warning escalation after applier patch",
useful_context: ["pathway memory bug fingerprints for the file area"],
model_routing_path: ["x-ai/grok-4.1-fast"],
commands_worked: ["cargo check --workspace"],
commands_failed: [],
validation_steps: ["warning count must not increase"],
repo_files_touched: ["crates/queryd/src/service.rs"],
recovery_strategy: "git checkout -- file when cargo red",
known_failure_modes: ["unused import noise"],
escalation_threshold: "use kimi-k2:1t when isolation mode rejects 2 attempts",
acceptance_criteria: ["cargo green", "warning count stable", "rationale-diff aligned"],
source_run_ids: ["run-xyz", "run-abc"],
created_at: NOW,
provenance: PROVENANCE,
};
test("Playbook: positive validates", () => {
const r = validatePlaybook(PLAYBOOK_OK);
if (!r.valid) console.error(r.errors);
expect(r.valid).toBe(true);
});
test("Playbook: empty source_run_ids rejected (every playbook traces to source — spec)", () => {
const r = validatePlaybook({ ...PLAYBOOK_OK, source_run_ids: [] });
expect(r.valid).toBe(false);
if (!r.valid) expect(r.errors.some(e => e.includes("source_run_ids"))).toBe(true);
});
test("Playbook: empty acceptance_criteria rejected (every playbook needs success criteria — spec)", () => {
const r = validatePlaybook({ ...PLAYBOOK_OK, acceptance_criteria: [] });
expect(r.valid).toBe(false);
if (!r.valid) expect(r.errors.some(e => e.includes("acceptance_criteria"))).toBe(true);
});
// ─── ScratchpadSummary ───────────────────────────────────────────────
const SCRATCHPAD_OK = {
schema_version: SCRATCHPAD_SCHEMA_VERSION,
run_id: "run-abc",
current_objective: "verify pr_audit mode end-to-end",
completed_steps: ["restart gateway"],
failed_steps: ["cloud chat returned 500"],
pending_steps: ["swap default model"],
important_paths: ["auditor/checks/inference.ts"],
decisions: ["defer kimi-k2 swap until upstream returns"],
unresolved_questions: ["does deepseek match kimi quality?"],
validation_status: "partial",
next_command: "bun run auditor/audit_one.ts 11",
source_scratchpad_hash: SHA,
summarized_at: NOW,
provenance: PROVENANCE,
};
test("ScratchpadSummary: positive validates", () => {
const r = validateScratchpadSummary(SCRATCHPAD_OK);
if (!r.valid) console.error(r.errors);
expect(r.valid).toBe(true);
});
test("ScratchpadSummary: invalid validation_status rejected", () => {
const r = validateScratchpadSummary({ ...SCRATCHPAD_OK, validation_status: "tbd" });
expect(r.valid).toBe(false);
if (!r.valid) expect(r.errors.some(e => e.includes("validation_status"))).toBe(true);
});
test("ScratchpadSummary: short scratchpad_hash rejected", () => {
const r = validateScratchpadSummary({ ...SCRATCHPAD_OK, source_scratchpad_hash: "short" });
expect(r.valid).toBe(false);
if (!r.valid) expect(r.errors.some(e => e.includes("source_scratchpad_hash"))).toBe(true);
});
// ─── ModelLedgerEntry ────────────────────────────────────────────────
const LEDGER_OK = {
schema_version: MODEL_LEDGER_SCHEMA_VERSION,
model_name: "kimi-k2:1t",
model_provider: "ollama_cloud",
task_type: "pr_audit",
success_rate: 0.85,
failure_modes: ["upstream_500", "context_truncation"],
best_partner_model: "x-ai/grok-4.1-fast",
escalation_role: "primary",
cost_usd_p50: 0.0002,
latency_ms_p50: 50000,
latency_ms_p95: 90000,
context_window: 200000,
sample_count: 47,
last_updated: NOW,
};
test("ModelLedgerEntry: positive validates", () => {
const r = validateModelLedgerEntry(LEDGER_OK);
if (!r.valid) console.error(r.errors);
expect(r.valid).toBe(true);
});
test("ModelLedgerEntry: success_rate > 1 rejected", () => {
const r = validateModelLedgerEntry({ ...LEDGER_OK, success_rate: 1.5 });
expect(r.valid).toBe(false);
if (!r.valid) expect(r.errors.some(e => e.includes("success_rate"))).toBe(true);
});
test("ModelLedgerEntry: zero sample_count rejected (no aggregate from zero)", () => {
const r = validateModelLedgerEntry({ ...LEDGER_OK, sample_count: 0 });
expect(r.valid).toBe(false);
if (!r.valid) expect(r.errors.some(e => e.includes("sample_count"))).toBe(true);
});
// ─── RagSample ───────────────────────────────────────────────────────
const RAG_OK = {
schema_version: RAG_SAMPLE_SCHEMA_VERSION,
id: "rag-pb-001",
title: "Scrum applier rationale-diff alignment",
content: "When the applier emits a patch with rationale claiming X but the diff shows Y, the rationale-token alignment gate catches it...",
tags: ["scrum_review", "applier"],
source_run_id: "run-xyz",
success_score: "accepted",
embedding_text: "applier rationale-diff alignment guard scrum",
exported_at: NOW,
provenance: PROVENANCE,
};
test("RagSample: positive validates", () => {
const r = validateRagSample(RAG_OK);
if (!r.valid) console.error(r.errors);
expect(r.valid).toBe(true);
});
test("RagSample: success_score=rejected forbidden (RAG only takes accepted+partial)", () => {
const r = validateRagSample({ ...RAG_OK, success_score: "rejected" });
expect(r.valid).toBe(false);
if (!r.valid) expect(r.errors.some(e => e.includes("success_score"))).toBe(true);
});
test("RagSample: whitespace-only content rejected", () => {
const r = validateRagSample({ ...RAG_OK, content: " \n " });
expect(r.valid).toBe(false);
if (!r.valid) expect(r.errors.some(e => e.includes("content"))).toBe(true);
});
// ─── SftSample (the strict one) ──────────────────────────────────────
const SFT_OK = {
schema_version: SFT_SAMPLE_SCHEMA_VERSION,
instruction: "Audit this PR diff against ship-claims.",
context: "claims: 3 strong, 2 moderate",
response: "{\"claim_verdicts\": [...]}",
source_run_id: "run-pr11",
quality_score: "accepted",
exported_at: NOW,
provenance: PROVENANCE,
};
test("SftSample: positive validates", () => {
const r = validateSftSample(SFT_OK);
if (!r.valid) console.error(r.errors);
expect(r.valid).toBe(true);
});
test("SftSample: quality_score=partially_accepted REJECTED (spec non-negotiable, no leak)", () => {
const r = validateSftSample({ ...SFT_OK, quality_score: "partially_accepted" });
expect(r.valid).toBe(false);
if (!r.valid) expect(r.errors.some(e => e.includes("quality_score"))).toBe(true);
});
test("SftSample: quality_score=rejected REJECTED (spec non-negotiable, no leak)", () => {
const r = validateSftSample({ ...SFT_OK, quality_score: "rejected" });
expect(r.valid).toBe(false);
});
test("SftSample: quality_score=needs_human_review REJECTED (no leak)", () => {
const r = validateSftSample({ ...SFT_OK, quality_score: "needs_human_review" });
expect(r.valid).toBe(false);
});
test("SftSample: empty response rejected (no empty pairs)", () => {
const r = validateSftSample({ ...SFT_OK, response: "" });
expect(r.valid).toBe(false);
if (!r.valid) expect(r.errors.some(e => e.includes("response"))).toBe(true);
});
test("SftSample: whitespace-only instruction rejected", () => {
const r = validateSftSample({ ...SFT_OK, instruction: " \t\n " });
expect(r.valid).toBe(false);
if (!r.valid) expect(r.errors.some(e => e.includes("instruction"))).toBe(true);
});
// ─── PreferenceSample ────────────────────────────────────────────────
const PREF_OK = {
schema_version: PREFERENCE_SAMPLE_SCHEMA_VERSION,
prompt: "Verify claim: 'all 3 services running on matrix-test'",
chosen: "{\"backed\": true, \"evidence\": \"systemctl status confirms 3 active\"}",
rejected: "{\"backed\": true, \"evidence\": \"the README says so\"}",
reason: "chosen cites runtime evidence, rejected cites doc claim only",
source_run_ids: { chosen: "run-A", rejected: "run-B" },
exported_at: NOW,
provenance: PROVENANCE,
};
test("PreferenceSample: positive validates", () => {
const r = validatePreferenceSample(PREF_OK);
if (!r.valid) console.error(r.errors);
expect(r.valid).toBe(true);
});
test("PreferenceSample: chosen == rejected rejected (no self-pairing)", () => {
const r = validatePreferenceSample({ ...PREF_OK, chosen: "x", rejected: "x" });
expect(r.valid).toBe(false);
if (!r.valid) expect(r.errors.some(e => e.includes("chosen and rejected"))).toBe(true);
});
test("PreferenceSample: source_run_ids both equal rejected", () => {
const r = validatePreferenceSample({ ...PREF_OK, source_run_ids: { chosen: "run-A", rejected: "run-A" } });
expect(r.valid).toBe(false);
if (!r.valid) expect(r.errors.some(e => e.includes("source_run_ids"))).toBe(true);
});
test("PreferenceSample: empty reason rejected (every preference needs WHY)", () => {
const r = validatePreferenceSample({ ...PREF_OK, reason: " " });
expect(r.valid).toBe(false);
if (!r.valid) expect(r.errors.some(e => e.includes("reason"))).toBe(true);
});

View File

@ -0,0 +1,86 @@
// ScoredRun — output of the deterministic Success Scorer (Phase 3).
// Spec mandates 4 categories with explicit reasons; we add scorer
// versioning so a future scorer change is detectable in historical data.
import {
ValidationResult, requireString, requireIsoTimestamp, requireProvenance, requireStringArray, requireNumber,
} from "./types";
export const SCORED_RUN_SCHEMA_VERSION = 1;
export const SCORE_CATEGORIES = ["accepted", "partially_accepted", "rejected", "needs_human_review"] as const;
export type ScoreCategory = (typeof SCORE_CATEGORIES)[number];
export interface ScoredRun {
schema_version: number;
evidence_run_id: string; // FK to EvidenceRecord.run_id
evidence_task_id: string; // FK to EvidenceRecord.task_id
category: ScoreCategory;
reasons: string[]; // human-readable, e.g. ["cargo_green=true", "anchor_grounding<0.7"]
scored_at: string; // ISO 8601
scorer_version: string; // e.g. "v1.0.0" — bumped on scorer code change
// Sub-scores that the scorer collapsed into the category. Persisted
// so a downstream UI can show "why" without re-running the scorer.
sub_scores?: {
cargo_green?: boolean;
anchor_grounding?: number;
schema_valid?: boolean;
pathway_replay_succeeded?: boolean;
observer_verdict?: "accept" | "reject" | "cycle";
[key: string]: unknown;
};
provenance: {
source_file: string;
line_offset?: number;
sig_hash: string;
recorded_at: string;
};
}
export function validateScoredRun(input: unknown): ValidationResult<ScoredRun> {
const errors: string[] = [];
if (typeof input !== "object" || input === null) {
return { valid: false, errors: ["expected object"] };
}
const r = input as Record<string, unknown>;
let ok = true;
if (r.schema_version !== SCORED_RUN_SCHEMA_VERSION) {
errors.push(`schema_version: expected ${SCORED_RUN_SCHEMA_VERSION}, got ${JSON.stringify(r.schema_version)}`);
ok = false;
}
ok = requireString(r.evidence_run_id, "evidence_run_id", errors) && ok;
ok = requireString(r.evidence_task_id, "evidence_task_id", errors) && ok;
ok = requireIsoTimestamp(r.scored_at, "scored_at", errors) && ok;
ok = requireString(r.scorer_version, "scorer_version", errors) && ok;
ok = requireStringArray(r.reasons, "reasons", errors) && ok;
if (Array.isArray(r.reasons) && r.reasons.length === 0) {
errors.push("reasons: must be non-empty (every score must have at least one reason)");
ok = false;
}
if (!SCORE_CATEGORIES.includes(r.category as ScoreCategory)) {
errors.push(`category: must be one of ${SCORE_CATEGORIES.join("|")}, got ${JSON.stringify(r.category)}`);
ok = false;
}
ok = requireProvenance(r.provenance, "provenance", errors) && ok;
if (r.sub_scores !== undefined) {
if (typeof r.sub_scores !== "object" || r.sub_scores === null) {
errors.push("sub_scores: expected object when present");
ok = false;
} else {
const ss = r.sub_scores as Record<string, unknown>;
if (ss.anchor_grounding !== undefined) {
if (!requireNumber(ss.anchor_grounding, "sub_scores.anchor_grounding", errors)) ok = false;
else if ((ss.anchor_grounding as number) < 0 || (ss.anchor_grounding as number) > 1) {
errors.push("sub_scores.anchor_grounding: must be in [0, 1]");
ok = false;
}
}
if (ss.observer_verdict !== undefined && !["accept", "reject", "cycle"].includes(ss.observer_verdict as string)) {
errors.push("sub_scores.observer_verdict: must be accept|reject|cycle");
ok = false;
}
}
}
if (!ok) return { valid: false, errors };
return { valid: true, value: r as unknown as ScoredRun };
}

View File

@ -0,0 +1,65 @@
// ScratchpadSummary — structured normalization of a tree-split or
// long-running scratchpad. Distinct from EvidenceRecord because a
// scratchpad accumulates across many calls; this schema captures the
// state at a checkpoint moment.
import {
ValidationResult, requireString, requireIsoTimestamp, requireProvenance, requireStringArray,
} from "./types";
export const SCRATCHPAD_SCHEMA_VERSION = 1;
export interface ScratchpadSummary {
schema_version: number;
run_id: string;
current_objective: string;
completed_steps: string[];
failed_steps: string[];
pending_steps: string[];
important_paths: string[]; // file paths the scratchpad references
decisions: string[]; // architectural/scope decisions made
unresolved_questions: string[];
validation_status: "pass" | "fail" | "partial" | "pending";
next_command?: string; // recommendation for next action
source_scratchpad_hash: string; // sha256 of the full source scratchpad text — diff detection
summarized_at: string; // ISO 8601
provenance: { source_file: string; line_offset?: number; sig_hash: string; recorded_at: string };
}
const STATUS = ["pass", "fail", "partial", "pending"];
export function validateScratchpadSummary(input: unknown): ValidationResult<ScratchpadSummary> {
const errors: string[] = [];
if (typeof input !== "object" || input === null) return { valid: false, errors: ["expected object"] };
const r = input as Record<string, unknown>;
let ok = true;
if (r.schema_version !== SCRATCHPAD_SCHEMA_VERSION) {
errors.push(`schema_version: expected ${SCRATCHPAD_SCHEMA_VERSION}, got ${JSON.stringify(r.schema_version)}`);
ok = false;
}
ok = requireString(r.run_id, "run_id", errors) && ok;
ok = requireString(r.current_objective, "current_objective", errors) && ok;
ok = requireIsoTimestamp(r.summarized_at, "summarized_at", errors) && ok;
if (typeof r.source_scratchpad_hash !== "string" || !/^[0-9a-f]{64}$/.test(r.source_scratchpad_hash as string)) {
errors.push("source_scratchpad_hash: must be hex sha256");
ok = false;
}
ok = requireStringArray(r.completed_steps, "completed_steps", errors) && ok;
ok = requireStringArray(r.failed_steps, "failed_steps", errors) && ok;
ok = requireStringArray(r.pending_steps, "pending_steps", errors) && ok;
ok = requireStringArray(r.important_paths, "important_paths", errors) && ok;
ok = requireStringArray(r.decisions, "decisions", errors) && ok;
ok = requireStringArray(r.unresolved_questions, "unresolved_questions", errors) && ok;
if (!STATUS.includes(r.validation_status as string)) {
errors.push(`validation_status: must be one of ${STATUS.join("|")}`);
ok = false;
}
if (r.next_command !== undefined && typeof r.next_command !== "string") {
errors.push("next_command: expected string when present");
ok = false;
}
ok = requireProvenance(r.provenance, "provenance", errors) && ok;
if (!ok) return { valid: false, errors };
return { valid: true, value: r as unknown as ScratchpadSummary };
}

View File

@ -0,0 +1,59 @@
// SftSample — entry in exports/sft/instruction_response.jsonl. Spec
// non-negotiable: ONLY accepted runs, never partial/rejected/needs_human.
// Validator enforces that invariant — exporters can't bypass.
import {
ValidationResult, requireString, requireIsoTimestamp, requireProvenance, requireNumber,
} from "./types";
export const SFT_SAMPLE_SCHEMA_VERSION = 1;
export interface SftSample {
schema_version: number;
instruction: string; // the prompt / user message
context?: string; // optional retrieved context that was visible
response: string; // the model output that was accepted
source_run_id: string;
quality_score: "accepted"; // hard-pinned — see validator
exported_at: string;
provenance: { source_file: string; line_offset?: number; sig_hash: string; recorded_at: string };
}
export function validateSftSample(input: unknown): ValidationResult<SftSample> {
const errors: string[] = [];
if (typeof input !== "object" || input === null) return { valid: false, errors: ["expected object"] };
const r = input as Record<string, unknown>;
let ok = true;
if (r.schema_version !== SFT_SAMPLE_SCHEMA_VERSION) {
errors.push(`schema_version: expected ${SFT_SAMPLE_SCHEMA_VERSION}, got ${JSON.stringify(r.schema_version)}`);
ok = false;
}
ok = requireString(r.instruction, "instruction", errors) && ok;
ok = requireString(r.response, "response", errors) && ok;
ok = requireString(r.source_run_id, "source_run_id", errors) && ok;
ok = requireIsoTimestamp(r.exported_at, "exported_at", errors) && ok;
ok = requireProvenance(r.provenance, "provenance", errors) && ok;
// Empty pair guard.
if (typeof r.instruction === "string" && (r.instruction as string).trim().length === 0) {
errors.push("instruction: must be non-whitespace (no empty pairs)");
ok = false;
}
if (typeof r.response === "string" && (r.response as string).trim().length === 0) {
errors.push("response: must be non-whitespace (no empty pairs)");
ok = false;
}
// The non-negotiable: SFT samples MUST have quality_score=accepted.
// Anything else is a leak from rejected/partial/needs_human into SFT.
if (r.quality_score !== "accepted") {
errors.push(`quality_score: must be 'accepted' (no rejected/partial/needs_human leak into SFT — spec non-negotiable). Got ${JSON.stringify(r.quality_score)}`);
ok = false;
}
if (r.context !== undefined && typeof r.context !== "string") {
errors.push("context: expected string when present");
ok = false;
}
if (!ok) return { valid: false, errors };
return { valid: true, value: r as unknown as SftSample };
}

View File

@ -0,0 +1,141 @@
// Shared types for distillation schemas. Hand-rolled validators (no Zod
// dependency) — bun:test runs them; runtime cost is one tiny function
// per record. Pattern: each schema exports `validate(x): ValidationResult`
// returning `{valid: true, value}` or `{valid: false, errors}`.
//
// Why hand-rolled: the auditor + scrum + observer pipelines emit JSONL
// rows in shapes that already work; we want to ENFORCE those shapes
// without adding a 100KB dependency or rewriting producers. The
// validators codify what we already produce.
//
// Naming: schemas live as nouns (`EvidenceRecord`), validators as
// `validate<Noun>`. Each schema file exports both the type and the
// validator.
export interface Provenance {
// Path to the JSONL or other source where this row came from. Always
// relative to /home/profit/lakehouse so receipts are reproducible
// across deploys with the same repo layout.
source_file: string;
// Optional byte offset / line number into the source file. Lets a
// future "open the source row" UI jump directly to the line. Some
// sources (single-row JSON files like _playbook_lessons/*.json) don't
// need this.
line_offset?: number;
// SHA-256 of the canonical JSON of the source row (sorted keys, no
// whitespace). This is the dedup key — running distillation twice on
// the same source produces identical sig_hash, so duplicates are
// detectable without full row comparison.
sig_hash: string;
// ISO 8601 of when this provenance link was recorded — usually the
// moment the unified Evidence Index ran. Distinct from the source
// row's own timestamp, which lives on the EvidenceRecord itself.
recorded_at: string;
}
// Returned by every schema validator. The shape is `{valid: true, value}`
// for success (so callers can use `value` with the right type narrowed)
// or `{valid: false, errors}` for failure (so callers can surface
// every error at once, not just the first).
export type ValidationResult<T> =
| { valid: true; value: T }
| { valid: false; errors: string[] };
// Standard helpers used by every schema. Centralized so naming +
// error message format stay consistent across schemas.
export function requireString(v: unknown, field: string, errors: string[]): v is string {
if (typeof v !== "string") {
errors.push(`${field}: expected string, got ${typeof v}`);
return false;
}
if (v.length === 0) {
errors.push(`${field}: must be non-empty`);
return false;
}
return true;
}
export function requireNumber(v: unknown, field: string, errors: string[]): v is number {
if (typeof v !== "number" || !Number.isFinite(v)) {
errors.push(`${field}: expected finite number, got ${typeof v}`);
return false;
}
return true;
}
export function requireIsoTimestamp(v: unknown, field: string, errors: string[]): v is string {
if (!requireString(v, field, errors)) return false;
// Permissive ISO 8601: YYYY-MM-DDTHH:MM:SS(.fraction)?(Z|±HH:MM)?
const re = /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?$/;
if (!re.test(v as string)) {
errors.push(`${field}: not a valid ISO 8601 timestamp: ${(v as string).slice(0, 60)}`);
return false;
}
return true;
}
export function requireSha256(v: unknown, field: string, errors: string[]): v is string {
if (!requireString(v, field, errors)) return false;
if (!/^[0-9a-f]{64}$/.test(v as string)) {
errors.push(`${field}: not a valid hex sha256: ${(v as string).slice(0, 80)}`);
return false;
}
return true;
}
export function requireProvenance(v: unknown, field: string, errors: string[]): v is Provenance {
if (typeof v !== "object" || v === null) {
errors.push(`${field}: expected object, got ${v === null ? "null" : typeof v}`);
return false;
}
const p = v as Record<string, unknown>;
let ok = true;
ok = requireString(p.source_file, `${field}.source_file`, errors) && ok;
ok = requireSha256(p.sig_hash, `${field}.sig_hash`, errors) && ok;
ok = requireIsoTimestamp(p.recorded_at, `${field}.recorded_at`, errors) && ok;
if (p.line_offset !== undefined && typeof p.line_offset !== "number") {
errors.push(`${field}.line_offset: expected number when present`);
ok = false;
}
return ok;
}
export function requireStringArray(v: unknown, field: string, errors: string[]): v is string[] {
if (!Array.isArray(v)) {
errors.push(`${field}: expected array, got ${typeof v}`);
return false;
}
for (let i = 0; i < v.length; i++) {
if (typeof v[i] !== "string") {
errors.push(`${field}[${i}]: expected string, got ${typeof v[i]}`);
return false;
}
}
return true;
}
// Compute the canonical sha256 used for sig_hash. Sorts keys so the
// hash is stable regardless of producer's serialization order. Uses
// Bun.CryptoHasher (sync, fast) rather than node:crypto — matches the
// rest of the auditor.
export async function canonicalSha256(obj: unknown): Promise<string> {
const ordered = orderKeys(obj);
const json = JSON.stringify(ordered);
const hasher = new Bun.CryptoHasher("sha256");
hasher.update(json);
return hasher.digest("hex");
}
function orderKeys(v: unknown): unknown {
if (v === null || typeof v !== "object") return v;
if (Array.isArray(v)) return v.map(orderKeys);
const out: Record<string, unknown> = {};
for (const k of Object.keys(v as object).sort()) {
out[k] = orderKeys((v as Record<string, unknown>)[k]);
}
return out;
}

View File

@ -0,0 +1,309 @@
# Local Distillation Pipeline — Repo Recon
**Date:** 2026-04-26
**Status:** Phase 0 (read-only inventory — no implementation yet)
**Spec:** `/home/profit/now.md`
**Branch:** `scrum/auto-apply-19814` head `f753e11` (uncommitted: auditor rebuild)
This document inventories what already exists in the Lakehouse repo before we build the distillation substrate. It is the gating artifact: per the spec, no implementation lands until this document is settled.
The headline finding: **~70% of the spec's modules already have working substrate** in the form of JSONL streams, vector corpora, scoring gates, and a partial extraction pipeline (`distilled_facts.jsonl` / `distilled_procedures.jsonl`). The work is integration + formalization, not greenfield. The biggest risk is shipping a parallel system that drifts from what the existing scrum/auditor/observer loops actually produce.
---
## 1. Repo structure
```
/home/profit/lakehouse
├── crates/ # 15 Rust crates (see PRD.md)
│ ├── shared/ # types.rs, profiles/, model_matrix.rs, secrets.rs
│ ├── gateway/ # /v1/* HTTP surface, mode router, observer event fanout
│ ├── vectord/ # HNSW + pathway_memory.rs (88 traces) + Mem0 versioning
│ ├── catalogd/ # column types, manifests
│ ├── truth/ # Phase 42 — TOML rule engine for SQL/request gates
│ ├── validator/ # Phase 43 — staffing/devops validators
│ └── ...
├── auditor/ # TypeScript PR auditor (Bun runtime)
│ ├── audit.ts # orchestrator
│ ├── audit_one.ts # one-shot harness
│ ├── claim_parser.ts # extracts ship-claims from PR body
│ ├── fact_extractor.ts # LLM Team /api/run?mode=extract integration
│ ├── kb_index.ts # **already a queryable index over data/_kb/*.jsonl**
│ ├── kb_stats.ts
│ ├── checks/ # static.ts, dynamic.ts, inference.ts, kb_query.ts
│ ├── policy.ts # severity → block/warn/info gates
│ └── gitea.ts # PR poller
├── tests/
│ ├── real-world/ # scrum_master_pipeline.ts, scrum_applier.ts, runs/
│ ├── multi-agent/ # scenarios/, playbooks/
│ ├── architecture_smoke.ts
│ ├── battery/
│ └── agent_test/
├── scripts/
│ ├── build_answers_corpus.ts # NEW 2026-04-26: lakehouse_answers_v1
│ ├── build_lakehouse_corpus.ts # arch corpus
│ ├── build_symbols_corpus.ts # symbols corpus
│ ├── build_scrum_findings_corpus.ts # findings corpus
│ ├── vectorize_raw_corpus.ts
│ ├── mode_experiment.ts / mode_compare.ts / mode_pass{2,3,4,5}_*.ts
│ └── ...
├── sidecar/ # Python (Ollama embed adapter)
├── mcp-server/ # observer.ts, relevance.ts, ai_models.ts (port 3700/3800)
├── ui/ # public Bun UI (devop.live/lakehouse, port 3700)
├── data/ # **the substrate this document audits** (see §3)
└── docs/
├── PRD.md
├── PHASES.md
├── DECISIONS.md (ADRs 001-021)
├── SCRUM_MASTER_SPEC.md
├── MATRIX_AGENT_HANDOVER.md
├── MODE_RUNNER_TUNING_PLAN.md
└── recon/
└── local-distillation-recon.md (this file)
```
---
## 2. Existing components by spec module
### 2.1 Gateway / orchestrator
`crates/gateway/src/v1/mode.rs` is the **prompt-molder substrate** — task_class → mode → enrichment composer + LLM call. Five native modes (codereview_lakehouse, codereview_isolation, codereview_null, codereview_matrix_only, codereview_playbook_only) + staffing_inference_lakehouse + (uncommitted) pr_audit. Strong-model auto-downgrade gate based on Pass 5 variance test.
**Distillation relevance:** The mode runner already encodes "compose pathway memory + matrix retrieval + framing into a one-shot prompt." The distillation pipeline can call mode runner endpoints rather than reimplementing retrieval.
### 2.2 Observer / scratchpad
- `mcp-server/observer.ts` — Bun service on `:3800`. `/event`, `/relevance`, `/review` endpoints. Receives scrum + scenario + langfuse-bridge sources. KB preamble blends pathway + arch + answers (3 sources).
- `mcp-server/relevance.ts` — adjacency-pollution heuristic filter (added 2026-04-25).
- Scrum scratchpad: `tests/real-world/scrum_master_pipeline.ts::treeSplitFile` — text-only multi-shard scratchpad. Auditor curates the same way (`auditor/checks/inference.ts::treeSplitDiff`).
**Distillation relevance:** Scratchpads are unstructured text. Spec wants structured extraction (objective/completed/failed/pending). The `distilled_*.jsonl` streams (§3) already do half of this for the LLM Team runs — extending to the scrum scratchpad is the gap.
### 2.3 Knowledge base / index
Two layers:
**Layer 1: append-only JSONL streams in `data/_kb/`** (see §3 for full inventory).
**Layer 2: vector corpora in `data/vectors/*.parquet`** + HNSW indexes.
Auditor's `kb_index.ts` already wraps the JSONLs as a queryable index. `kb_query.ts` check uses it to surface recurring patterns across PRs.
**Distillation relevance:** Layer 1 is the EvidenceCollector substrate. Layer 2 is the HybridIndexer substrate. Neither has a unified record schema across streams — that's the formalization work.
### 2.4 MCP / context integrations
- `.mcp.json` — MCP server configuration (gitea, etc.)
- `mcp-server/` — observer + relevance + ai_models surfaces
- LLM Team UI (port 5000) — `/api/run?mode=extract` is the only registered mode (per `feedback_endpoint_probe_discipline.md`); `code_review/patch/refactor` return "Unknown mode"
- `aibridge` crate — Rust ↔ Python sidecar; OpenAI-compat proxy as of `3a0b37e`
**Distillation relevance:** Existing call surfaces are already the right shape. Distillation pipeline runs ON the gateway via `/v1/*`, not on a parallel runtime.
### 2.5 PRD / requirements docs
- `docs/PRD.md` — phases 0-37 (shipped) + 38-44 productization
- `docs/CONTROL_PLANE_PRD.md` — long-horizon control plane (2026-04-22 pivot)
- `docs/PHASES.md` — phase tracker
- `docs/DECISIONS.md` — ADRs 001-021 (021 is semantic-correctness matrix layer)
- `docs/SCRUM_MASTER_SPEC.md` — scrum loop architecture + refactor timeline
- `docs/MODE_RUNNER_TUNING_PLAN.md` — open knobs
**Distillation relevance:** PRD is the ground truth for the PRD-drift comparator. PHASES.md + auditor's `phase_sweep_findings.jsonl` already encode partial drift reports.
### 2.6 Model routing logic
- `config/modes.toml` — task_class → mode/model registry (6 task classes including new pr_audit)
- `crates/gateway/src/v1/mode.rs::is_weak_model` — strong/weak heuristic for matrix corpus downgrade
- `data/_kb/model_trust.jsonl` (45K) — per-run model performance ledger (run_id, accepted_model, attempts_made, etc.)
- `data/_kb/mode_experiments.jsonl` (1.3M) — per-call mode runner telemetry (mode, model, latency_ms, sources, response, response_chars)
**Distillation relevance:** `mode_experiments.jsonl` is the cleanest per-call record we have — it's already an EvidenceRecord with everything except observer_notes and human_override fields. The Model Routing Ledger spec module is mostly an aggregation script over this jsonl + model_trust.jsonl.
### 2.7 Logs / traces
- Langfuse (port 3001, docker `langfuse`) — every `/v1/chat` and `/v1/respond` call (`crates/gateway/src/v1/langfuse_trace.rs`). Fire-and-forget.
- Observer `/event` — every `/v1/chat` call also fires here (`d1d97a0`)
- `data/_observer/ops.jsonl` — observer event log (mcp-server side)
- `data/_auditor/verdicts/*.json` — per-PR auditor verdict
- Systemd journals: lakehouse, lakehouse-sidecar, lakehouse-observer, lakehouse-auditor
**Distillation relevance:** Langfuse + observer events are the trace substrate, but they're not yet linked to the JSONL streams via shared run_id. Linkage is part of EvidenceRecord work.
### 2.8 Test framework
- Bun-native tests in `crates/*/src/**/*test*` (Rust) and `tests/*` (TypeScript)
- `tests/real-world/` — scrum master + applier integration
- `tests/architecture_smoke.ts` — PRD-invariant probe against 500k workers
- `tests/multi-agent/scenarios/` — 20+ scenario fixtures (Heritage_Foods, Riverfront_Steel, etc.)
- `auditor/fixtures/hybrid_38_40_45.ts` — auditor's own dynamic fixture
**Distillation relevance:** Test framework supports both Rust and TS. The acceptance-gate suite (Phase 6 of distillation plan) lands in `tests/distillation/`.
### 2.9 Data schemas (existing, implicit)
The shapes that matter, by JSONL:
| File | Key fields | Provenance fields |
|------|-----------|-------------------|
| `audits.jsonl` (2.6M) | full per-PR verdict | `pr_number`, `head_sha`, `audited_at` |
| `audit_facts.jsonl` (506K) | extracted facts/entities/relationships from auditor inference | `pr_number`, `head_sha`, `extracted_at`, `extractor`, `verifier`, `llm_team_run_id` |
| `audit_lessons.jsonl` (539K) | derived lessons from past audits | (similar to facts) |
| `audit_discrepancies.jsonl` | N=3 consensus splits — chosen/rejected pairs | `pr_number`, `head_sha`, `claim_idx`, `votes`, `resolution` |
| `scrum_reviews.jsonl` (448K) | per-file scrum review (forensic JSON or markdown) | `file`, `reviewed_at`, `accepted_model`, `accepted_on_attempt` |
| `auto_apply.jsonl` (14K) | applier action per file | `file`, `ts`, `action`, `patches_applied` |
| `mode_experiments.jsonl` (1.3M) | per-call mode runner telemetry | `ts`, `task_class`, `mode`, `model`, `file_path`, `sources`, `latency_ms` |
| `observer_escalations.jsonl` (1.9K) | observer-diagnosed failure clusters | `ts`, `sig_hash`, `cluster_size`, `analysis`, `mode`, `kb_preamble_chars` |
| `observer_reviews.jsonl` (97K) | observer hand-reviews of scrum attempts | (TBD) |
| `model_trust.jsonl` (45K) | per-run model trust ledger | `run_id`, `task_type`, `accepted_model`, `attempts_made`, `confidence_avg`, `errors`, `thin_rejections` |
| `outcomes.jsonl` (98K) | per-run scenario outcomes | `run_id`, `sig_hash`, `created_at`, `models`, `total_events`, `ok_events`, `total_citations`, `total_gap_signals` |
| `human_overrides.jsonl` (2.4K) | human-in-loop overrides | (TBD) |
| `overseer_corrections.jsonl` (21K) | overseer model corrections | (TBD) |
| `phase_sweep_findings.jsonl` (45K) | phase-audit drift findings | `phase`, `phase_name`, `status`, `claims_verified`, `claims_fake`, `claims_partial`, `findings`, `evidence`, `discovered_at` |
| `doc_drift_corrections.jsonl` (603B) | doc drift signals | (TBD) |
| `pathway_recommendations.jsonl` (57K) | pathway memory hot-swap recommendations | `run_id` |
| `signatures.jsonl` (270K) | run signatures for dedup/grouping | (TBD) |
| `classifications.jsonl` (52K) | task-type classifications | (TBD — likely the task_type taxonomy) |
| `contract_analyses.jsonl` (4.3K) | contract analysis runs (closest to canonical EvidenceRecord) | `ts`, `ok`, `permit_id`, `analysis`, `matrix_corpora`, `matrix_hits`, `matrix_ms`, `observer_verdict`, `observer_conf`, `observer_notes`, `observer_src`, `cost`, `duration_ms` |
| `distilled_facts.jsonl` (179K) | **already-distilled fact stream** | `run_id`, `sig_hash`, `created_at`, `extractor`, `verifier`, `categorizer`, `category`, `text`, `embedding`, `embed_dim`, `schema_version`, `source_label`, `source_service` |
| `distilled_procedures.jsonl` (21K) | **already-distilled procedure stream** | (same shape as facts) |
| `distilled_config_hints.jsonl` (22K) | **already-distilled config-hint stream** | (same shape) |
---
## 3. The data substrate (what's already produced)
### Schema observation
`distilled_facts.jsonl` and `distilled_procedures.jsonl` already match what now.md calls a normalized evidence record — almost. They have:
✅ run_id, sig_hash (provenance + dedup)
✅ extractor, verifier, categorizer (deterministic role labels)
✅ schema_version (forward-compat)
✅ embedding pre-computed (already in HybridIndexer Layer 2!)
✅ category, source_label, source_service (taxonomy + origin)
✅ text (the distilled content)
❌ no observer_notes
❌ no commands_run / tool_calls
❌ no validation_results / failure_markers
❌ no human_override
So: **the `distilled_*` streams are an EvidenceRecord prototype, narrowed to LLM-extracted text.** Extending the schema to cover the missing fields (or sourcing them via JOIN to other streams) is the Phase 1 work.
`contract_analyses.jsonl` is the **other** prototype — it carries observer integration fields (verdict, confidence, notes, src) plus retrieval telemetry (matrix_corpora, matrix_hits, matrix_ms) plus per-call cost/duration. Different shape, but more complete in some axes.
The right move is to **reconcile both shapes** into a single schema rather than picking one.
### Vector corpora (HybridIndexer Layer 2)
20 corpora live in `data/vectors/*.parquet`:
- `lakehouse_arch_v1` — architecture corpus
- `lakehouse_symbols_v1` — symbol corpus (via tree-sitter or grep)
- `lakehouse_answers_v1` — gold-standard prior reviews + escalations (commit `0844206`)
- `scrum_findings_v1` — old, superseded by answers_v1
- `distilled_factual_v202604*`, `distilled_procedural_v202604*`, `distilled_config_hint_v202604*` — vectorized distilled streams
- `kb_team_runs_v1`, `kb_team_runs_agent`, `llm_team_runs_v1` — LLM Team artifact corpora
- `chicago_permits_v1`, `entity_brief_v1`, `ethereal_workers_v1`, `workers_500k_v8` — domain corpora
- `threat_intel_v1`, `sec_tickers_v1` — external
The hybrid retrieval pattern is established: `mode.rs` queries top_k from each named corpus, merges by score, takes top 8, drops via `/relevance`. **Keyword/BM25 is missing** (the spec asks for hybrid keyword + semantic) — but DataFusion in queryd can run substring/regex queries on the underlying Parquet, so the substrate is there.
---
## 4. Gap analysis (spec module → real gap)
| Spec module | What we have | Gap |
|------|------|-----|
| Evidence Collector | 23 source JSONLs, 2 prototype schemas (`distilled_*`, `contract_analyses`) | Unified `EvidenceRecord` schema spanning all sources + JOIN view by run_id/file/timestamp |
| Success Scorer | 5 scrum_applier gates, auditor verdicts, mode_compare grounding %, pathway replay rate, scrum verdict, observer accept/reject | Single deterministic function combining these into 4 categories with explicit reasons[] |
| Playbook Extractor | bug_fingerprints (semantic-correctness layer), `_playbook_memory/`, `_playbook_lessons/` (50+ JSON), distilled_procedures.jsonl | Full task-flow playbooks (model routing path + commands_run + recovery + escalation triggers); current playbooks are bug-pattern + staffing-fill, not procedural |
| Hybrid Indexer | 20 vector corpora + pathway_memory + auditor `kb_index.ts` | Keyword/BM25 layer; task-tag filters (the embedding side is solid) |
| Dataset Builder | nothing exporting in spec format | NET NEW — `build_rag_dataset.ts`, `build_sft_dataset.ts`, `build_preference_dataset.ts` |
| Scratchpad Normalizer | tree-split scratchpads (text), `distilled_*.jsonl` (LLM-extracted) | Structured normalization of scrum/auditor scratchpads into objective/completed/failed/pending JSON |
| PRD Drift Comparator | auditor inference + static + `phase_sweep_findings.jsonl` + `doc_drift_corrections.jsonl` | Per-repo-state snapshot (the existing pieces are per-PR or per-phase) |
| Model Routing Ledger | `model_trust.jsonl` + `mode_experiments.jsonl` + strong-model downgrade gate | Aggregated, queryable view by task_type × model_name |
| Receipts | per-call jsonl rows + auditor verdicts | Per-pipeline-stage `receipt.json` with git_sha + input/output hashes + record_counts |
---
## 5. Risks
1. **Drift from existing loops.** The scrum, auditor, and observer pipelines all write into the substrate. A distillation pipeline that defines its own EvidenceRecord without conforming to those producers' shapes will drift. Mitigation: derive `EvidenceRecord` schema from existing JSONL keys, formalize what's there before adding new fields.
2. **Over-distillation as theater.** It's tempting to "extract" content from raw runs without checking the existing distilled_facts/procedures already cover the run. Mitigation: dedup by `sig_hash` against existing distilled streams before extracting; emit pure pass-through rows when source already has a distilled twin.
3. **Stale extraction.** `distilled_facts.jsonl` was last touched 2026-04-23 — 3 days old. `distilled_config_hints.jsonl` similar. If the extraction pipeline that produces them has rotted, building on top of them propagates rot. Mitigation: run the distillation extractor once on a fresh run before treating these as canonical; verify schema_version still matches.
4. **No-leak invariant on SFT.** The spec is non-negotiable: rejected runs must NEVER appear in `exports/sft/instruction_response.jsonl`. Easy to violate via JOIN bugs. Mitigation: SFT export reads only `category=accepted` rows from `scored-runs/*.jsonl`; tests enforce this with a fixture containing rejected/partial mix.
5. **Provenance integrity.** Every export row must trace to a source jsonl row. Mitigation: `provenance` field is `{source_file, line_offset, sig_hash}`; export-side validator checks each row's source_file exists and contains a row with matching sig_hash.
6. **Receipts as security theater.** A receipt that just says "ran successfully" is worse than nothing. Mitigation: receipts include git_sha, sha256 of input/output files, record_counts (in vs out), and an explicit `validation_pass` boolean tied to schema validators.
7. **Hybrid index keyword side.** Adding BM25 over Parquet via DataFusion is doable but requires a custom UDF. If we punt this to "later," the hybrid in HybridIndexer is dishonest naming. Mitigation: ship Phase 1-5 with semantic-only and rename the module `SemanticIndexer`; add BM25 in a follow-up phase rather than claiming hybrid prematurely.
8. **Upstream model outage.** Just observed: `kimi-k2:1t` is currently 500-ing on Ollama Cloud. If distillation pipeline depends on a single model for verification, an outage breaks the whole pipeline. Mitigation: deterministic validators must NOT call any LLM; only the LLM-driven steps (initial extraction) should depend on cloud. Failures degrade gracefully — extracted text gets routed to `needs_human_review` not silently dropped.
---
## 6. Recommended integration points
1. **Reuse `auditor/kb_index.ts` as the EvidenceCollector substrate.** It already reads JSONL streams. Extend it to emit the unified EvidenceRecord by JOINing across streams by `run_id`/`file`/`sig_hash`.
2. **Reuse `crates/shared/src/profiles/` as the schema home for model ledger entries.** `MemoryProfile` and `RetrievalProfile` are already typed. Add `ModelRoutingLedger` alongside.
3. **Reuse `mode_experiments.jsonl` as the per-call truth source.** It's the most complete record per call (mode, model, sources, response, latency_ms, ts). Treat it as the canonical "execution trace" for any /v1/mode/execute call.
4. **Reuse `data/vectors/*` as the HybridIndexer storage.** Don't add a parallel index — the Parquet + HNSW pattern is already proven. The new RAG export emits TO an existing-shaped corpus.
5. **Reuse `scripts/build_*_corpus.ts` as the dataset-building convention.** They're already idempotent, take env knobs (LH_GATEWAY, LH_CHUNK_SIZE, LH_OVERLAP), and POST to `/vectors/index`. The new export scripts follow the same shape.
6. **Reuse `mcp-server/observer.ts` as the validation event sink.** Distillation pipeline stages emit `/event` calls so a future UI can show pipeline progress alongside scrum + scenario events.
7. **Reuse `auditor/policy.ts` as the gate-pattern reference.** The 5-gate scrum_applier and the `policy.ts` severity dispatch both encode the discipline of "deterministic check first, model opinion never." Success Scorer follows the same pattern.
8. **Reuse `contract_analyses.jsonl` as the EvidenceRecord prototype.** It's the closest existing schema to what now.md asks for. Migrate its fields into the unified EvidenceRecord; backfill its rows into `data/evidence/`.
---
## 7. Schemas to formalize in Phase 1
Based on the inventory above, the schemas Phase 1 needs to define are:
1. **EvidenceRecord** — derived from `contract_analyses` + `mode_experiments` + observer fields + the spec's required fields (run_id, task_id, timestamp, model_name, model_role, input_hash, output_hash, source_files, commands_run, retrieved_context, observer_notes, scratchpad_summary, success_markers, failure_markers, validation_results, human_override, provenance)
2. **ScoredRun**`{evidence_run_id, category in {accepted, partially_accepted, rejected, needs_human_review}, reasons: string[], scored_at, scorer_version}`
3. **Playbook**`{playbook_id, task_type, problem_pattern, useful_context, model_routing_path, commands_worked, commands_failed, validation_steps, repo_files_touched, recovery_strategy, known_failure_modes, escalation_threshold, acceptance_criteria, source_run_ids, created_at}`
4. **ScratchpadSummary**`{run_id, current_objective, completed_steps, failed_steps, pending_steps, important_paths, decisions, unresolved_questions, validation_status, next_command, source_scratchpad_hash}`
5. **ModelLedgerEntry**`{model_name, model_provider, task_type, success_rate, failure_modes, best_partner_model, escalation_role, cost, latency_p50, latency_p95, context_window, sample_count, last_updated}`
6. **RagSample** — spec shape exactly
7. **SftSample** — spec shape exactly + strict `score=accepted` invariant
8. **PreferenceSample** — spec shape exactly + `chosen != rejected` invariant
9. **Receipt**`{command, git_sha, input_files: [{path, sha256}], output_files: [{path, sha256}], record_counts: {in, out}, validation_pass, errors, warnings, duration_ms, started_at, ended_at}`
Each schema lands in `crates/shared/src/schemas/distillation/` (Rust source-of-truth) + `auditor/schemas/distillation/` (TS validators). Phase 1 acceptance: every schema has 2+ positive fixtures (drawn from existing JSONL rows) and 2+ negative fixtures (missing required, wrong type, no provenance).
---
## 8. Phase 1 readiness checklist
Before Phase 1 starts, the following must be true:
- [x] Recon doc exists (this file)
- [x] Sample shapes captured for the 8+ source JSONLs the schemas derive from
- [x] Existing distilled_* streams audited — confirmed they're prototypes, not blockers
- [x] Existing vector corpora inventoried — confirmed HybridIndexer Layer 2 substrate is real
- [x] Risks listed with mitigations
- [x] Integration points named — derive, don't reinvent
Phase 1 is unblocked after this document is reviewed by the user. Implementation begins with `crates/shared/src/schemas/distillation/evidence_record.rs` + matching `auditor/schemas/distillation/evidence_record.ts` Zod validator + 2/2 fixtures from `distilled_facts.jsonl` and `contract_analyses.jsonl`.
---
## 9. What this document is NOT
- Not a green-light to start implementation. The spec is explicit: schemas first, then everything else.
- Not a commitment to build all 9 schemas in parallel. Phase 1 ships the EvidenceRecord schema alone if necessary, with the others queued behind it.
- Not a replacement for the spec at `/home/profit/now.md`. Spec is canonical; this document maps spec onto current state.
- Not a survey of the staffing pipeline (`crates/validator/staffing/*`, scenarios/, etc.). Distillation is orthogonal — the staffing pipeline is one of the many sources distillation reads from, not its target.

View File

@ -0,0 +1,253 @@
// Source-row → EvidenceRecord transforms. Promoted from
// auditor/schemas/distillation/realdata.test.ts PROBES array. Each
// transform is pure: no I/O, no model calls, no clock reads (caller
// supplies recorded_at). Deterministic by construction so re-running
// the materializer on identical input produces byte-identical output.
//
// Adding a new source: append a TransformDef. Order in TRANSFORMS[]
// has no effect (each runs against its own source_file).
import type { EvidenceRecord, ModelRole } from "../../auditor/schemas/distillation/evidence_record";
import { EVIDENCE_SCHEMA_VERSION } from "../../auditor/schemas/distillation/evidence_record";
import { canonicalSha256 } from "../../auditor/schemas/distillation/types";
export interface TransformInput {
row: any;
line_offset: number;
source_file_relpath: string; // relative to repo root
recorded_at: string; // ISO 8601 — caller's "now"
sig_hash: string; // canonical sha256 of orderedKeys(row), pre-computed by caller
}
export interface TransformDef {
source_file_relpath: string; // relative to repo root, e.g. "data/_kb/distilled_facts.jsonl"
transform: (input: TransformInput) => Partial<EvidenceRecord> | null;
}
function provenance(input: TransformInput): EvidenceRecord["provenance"] {
return {
source_file: input.source_file_relpath,
line_offset: input.line_offset,
sig_hash: input.sig_hash,
recorded_at: input.recorded_at,
};
}
const TIME_TO_MS = (iso: string): number => new Date(iso).getTime();
export const TRANSFORMS: TransformDef[] = [
// ── Tier 1: validated 100% in Phase 1 ─────────────────────────────
{
source_file_relpath: "data/_kb/distilled_facts.jsonl",
transform: ({ row, line_offset, source_file_relpath, recorded_at, sig_hash }) => ({
run_id: String(row.run_id ?? `distilled_facts:${line_offset}`),
task_id: String(row.source_label ?? `distilled_facts:${line_offset}`),
timestamp: row.created_at,
schema_version: EVIDENCE_SCHEMA_VERSION,
provenance: provenance({ row, line_offset, source_file_relpath, recorded_at, sig_hash }),
model_name: row.extractor,
model_role: "extractor" as ModelRole,
model_provider: "ollama",
text: row.text,
}),
},
{
source_file_relpath: "data/_kb/distilled_procedures.jsonl",
transform: ({ row, line_offset, source_file_relpath, recorded_at, sig_hash }) => ({
run_id: String(row.run_id ?? `distilled_procedures:${line_offset}`),
task_id: String(row.source_label ?? `distilled_procedures:${line_offset}`),
timestamp: row.created_at,
schema_version: EVIDENCE_SCHEMA_VERSION,
provenance: provenance({ row, line_offset, source_file_relpath, recorded_at, sig_hash }),
model_name: row.extractor,
model_role: "extractor" as ModelRole,
model_provider: "ollama",
text: row.text,
}),
},
{
source_file_relpath: "data/_kb/distilled_config_hints.jsonl",
transform: ({ row, line_offset, source_file_relpath, recorded_at, sig_hash }) => ({
run_id: String(row.run_id ?? `distilled_config_hints:${line_offset}`),
task_id: String(row.source_label ?? `distilled_config_hints:${line_offset}`),
timestamp: row.created_at,
schema_version: EVIDENCE_SCHEMA_VERSION,
provenance: provenance({ row, line_offset, source_file_relpath, recorded_at, sig_hash }),
model_name: row.extractor,
model_role: "extractor" as ModelRole,
model_provider: "ollama",
text: row.text,
}),
},
{
source_file_relpath: "data/_kb/contract_analyses.jsonl",
transform: ({ row, line_offset, source_file_relpath, recorded_at, sig_hash }) => ({
run_id: `contract_analysis:${row.permit_id}:${TIME_TO_MS(row.ts)}`,
task_id: `permit:${row.permit_id}`,
timestamp: row.ts,
schema_version: EVIDENCE_SCHEMA_VERSION,
provenance: provenance({ row, line_offset, source_file_relpath, recorded_at, sig_hash }),
model_role: "executor" as ModelRole,
retrieved_context: {
matrix_corpora: Object.keys(row.matrix_corpora ?? {}),
matrix_hits: row.matrix_hits,
},
observer_notes: row.observer_notes ? [row.observer_notes].flat().filter(Boolean) : undefined,
observer_verdict: row.observer_verdict,
observer_confidence: row.observer_conf,
success_markers: row.ok ? ["matrix_hits_above_threshold"] : undefined,
failure_markers: !row.ok || row.observer_verdict === "reject" ? ["observer_rejected"] : undefined,
cost_usd: typeof row.cost === "number" ? row.cost / 1_000_000 : undefined,
latency_ms: row.duration_ms,
text: row.analysis,
}),
},
{
source_file_relpath: "data/_kb/mode_experiments.jsonl",
transform: ({ row, line_offset, source_file_relpath, recorded_at, sig_hash }) => ({
run_id: `mode_exec:${TIME_TO_MS(row.ts)}:${row.file_path ?? line_offset}`,
task_id: row.task_class,
timestamp: row.ts,
schema_version: EVIDENCE_SCHEMA_VERSION,
provenance: provenance({ row, line_offset, source_file_relpath, recorded_at, sig_hash }),
model_name: row.model,
model_role: "executor" as ModelRole,
model_provider: row.model?.includes("/") ? "openrouter" : "ollama_cloud",
retrieved_context: {
matrix_corpora: row.sources?.matrix_corpus,
matrix_chunks_kept: row.sources?.matrix_chunks_kept,
matrix_chunks_dropped: row.sources?.matrix_chunks_dropped,
pathway_fingerprints_seen: row.sources?.bug_fingerprints_count,
},
latency_ms: row.latency_ms,
text: row.response,
source_files: row.file_path ? [row.file_path] : undefined,
}),
},
{
source_file_relpath: "data/_kb/scrum_reviews.jsonl",
transform: ({ row, line_offset, source_file_relpath, recorded_at, sig_hash }) => ({
run_id: `scrum:${TIME_TO_MS(row.reviewed_at)}:${row.file}`,
task_id: `scrum_review:${row.file}`,
timestamp: row.reviewed_at,
schema_version: EVIDENCE_SCHEMA_VERSION,
provenance: provenance({ row, line_offset, source_file_relpath, recorded_at, sig_hash }),
model_name: row.accepted_model,
model_role: "executor" as ModelRole,
source_files: [row.file],
success_markers: row.accepted_on_attempt ? [`accepted_on_attempt_${row.accepted_on_attempt}`] : undefined,
text: row.suggestions_preview,
}),
},
{
source_file_relpath: "data/_kb/observer_escalations.jsonl",
transform: ({ row, line_offset, source_file_relpath, recorded_at, sig_hash }) => ({
run_id: `obs_esc:${TIME_TO_MS(row.ts)}:${row.sig_hash}`,
task_id: `observer_escalation:${row.cluster_endpoint ?? "?"}`,
timestamp: row.ts,
schema_version: EVIDENCE_SCHEMA_VERSION,
provenance: provenance({ row, line_offset, source_file_relpath, recorded_at, sig_hash }),
model_role: "reviewer" as ModelRole,
prompt_tokens: row.prompt_tokens,
completion_tokens: row.completion_tokens,
text: row.analysis,
}),
},
{
source_file_relpath: "data/_kb/audit_facts.jsonl",
transform: ({ row, line_offset, source_file_relpath, recorded_at, sig_hash }) => ({
run_id: `audit_facts:${row.head_sha}:${line_offset}`,
task_id: `pr:${row.pr_number}`,
timestamp: row.extracted_at,
schema_version: EVIDENCE_SCHEMA_VERSION,
provenance: provenance({ row, line_offset, source_file_relpath, recorded_at, sig_hash }),
model_name: row.extractor,
model_role: "extractor" as ModelRole,
text: JSON.stringify({
facts: row.facts?.length ?? 0,
entities: row.entities?.length ?? 0,
relationships: row.relationships?.length ?? 0,
}),
}),
},
// ── Tier 2: untested streams that still belong in EvidenceRecord ──
{
// auto_apply.jsonl is metadata-only (no text payload). Keep the row
// in evidence so success/failure markers contribute to scoring,
// even though the text field is empty.
source_file_relpath: "data/_kb/auto_apply.jsonl",
transform: ({ row, line_offset, source_file_relpath, recorded_at, sig_hash }) => {
const ts: string = row.ts ?? new Date().toISOString();
const action = String(row.action ?? "unknown");
const success = action === "committed";
const reverted = action.includes("reverted");
return {
run_id: `auto_apply:${TIME_TO_MS(ts)}:${row.file ?? line_offset}`,
task_id: `auto_apply:${row.file ?? "?"}`,
timestamp: ts,
schema_version: EVIDENCE_SCHEMA_VERSION,
provenance: provenance({ row, line_offset, source_file_relpath, recorded_at, sig_hash }),
model_role: "applier" as ModelRole,
source_files: row.file ? [row.file] : undefined,
success_markers: success ? ["committed"] : undefined,
failure_markers: reverted ? [action] : undefined,
};
},
},
{
source_file_relpath: "data/_kb/observer_reviews.jsonl",
transform: ({ row, line_offset, source_file_relpath, recorded_at, sig_hash }) => ({
run_id: `obs_rev:${TIME_TO_MS(row.ts ?? row.reviewed_at)}:${row.file ?? line_offset}`,
task_id: row.file ? `observer_review:${row.file}` : `observer_review:${line_offset}`,
timestamp: row.ts ?? row.reviewed_at,
schema_version: EVIDENCE_SCHEMA_VERSION,
provenance: provenance({ row, line_offset, source_file_relpath, recorded_at, sig_hash }),
model_role: "reviewer" as ModelRole,
observer_verdict: row.verdict,
observer_confidence: row.confidence,
observer_notes: row.notes ? [row.notes].flat().filter(Boolean) : undefined,
text: row.notes ?? row.review ?? undefined,
}),
},
{
source_file_relpath: "data/_kb/audits.jsonl",
transform: ({ row, line_offset, source_file_relpath, recorded_at, sig_hash }) => ({
run_id: `audit:${row.head_sha ?? line_offset}`,
task_id: `pr:${row.pr_number}`,
timestamp: row.audited_at ?? new Date().toISOString(),
schema_version: EVIDENCE_SCHEMA_VERSION,
provenance: provenance({ row, line_offset, source_file_relpath, recorded_at, sig_hash }),
model_role: "reviewer" as ModelRole,
success_markers: row.overall === "approve" ? ["approved"] : undefined,
failure_markers: row.overall === "block" ? ["blocked"] : (row.overall === "request_changes" ? ["request_changes"] : undefined),
validation_results: { schema_valid: true, [`overall_${row.overall ?? "?"}`]: true },
text: row.one_liner ?? "",
}),
},
{
source_file_relpath: "data/_kb/outcomes.jsonl",
transform: ({ row, line_offset, source_file_relpath, recorded_at, sig_hash }) => ({
run_id: `outcome:${row.run_id ?? line_offset}`,
task_id: row.sig_hash ? `outcome_sig:${row.sig_hash}` : `outcome:${line_offset}`,
timestamp: row.created_at,
schema_version: EVIDENCE_SCHEMA_VERSION,
provenance: provenance({ row, line_offset, source_file_relpath, recorded_at, sig_hash }),
model_role: "executor" as ModelRole,
latency_ms: typeof row.elapsed_secs === "number" ? Math.round(row.elapsed_secs * 1000) : undefined,
success_markers: typeof row.ok_events === "number" && typeof row.total_events === "number"
? (row.ok_events === row.total_events && row.total_events > 0 ? ["all_events_ok"] : undefined)
: undefined,
validation_results: typeof row.total_gap_signals === "number"
? { gap_signals: row.total_gap_signals, citation_count: row.total_citations }
: undefined,
}),
},
];
export function transformByPath(source_file_relpath: string): TransformDef | undefined {
return TRANSFORMS.find(t => t.source_file_relpath === source_file_relpath);
}
// Re-export for materializer convenience.
export { canonicalSha256 };