// Source-row → EvidenceRecord transforms. Promoted from
// auditor/schemas/distillation/realdata.test.ts PROBES array. Each
// transform is pure: no I/O, no model calls, no clock reads (caller
// supplies recorded_at). Deterministic by construction so re-running
// the materializer on identical input produces byte-identical output.
//
// Adding a new source: append a TransformDef. Order in TRANSFORMS[]
// has no effect (each runs against its own source_file).

import type { EvidenceRecord, ModelRole } from "../../auditor/schemas/distillation/evidence_record";
import { EVIDENCE_SCHEMA_VERSION } from "../../auditor/schemas/distillation/evidence_record";
import { canonicalSha256 } from "../../auditor/schemas/distillation/types";

/** Everything a transform may read; it must not reach for anything else. */
export interface TransformInput {
  // Parsed source row. Shapes differ per source file, so it is left untyped.
  row: any;
  line_offset: number;
  source_file_relpath: string; // relative to repo root
  recorded_at: string; // ISO 8601 — caller's "now"
  sig_hash: string; // canonical sha256 of orderedKeys(row), pre-computed by caller
}

/** Binds one source file to the transform that converts its rows. */
export interface TransformDef {
  source_file_relpath: string; // relative to repo root, e.g. "data/_kb/distilled_facts.jsonl"
  transform: (input: TransformInput) => Partial<EvidenceRecord> | null;
}

/** Builds the provenance stanza shared by every record from the caller-supplied input. */
function provenance(input: TransformInput): EvidenceRecord["provenance"] {
  return {
    source_file: input.source_file_relpath,
    line_offset: input.line_offset,
    sig_hash: input.sig_hash,
    recorded_at: input.recorded_at,
  };
}

// Epoch milliseconds for an ISO timestamp, used to build run_id strings.
// NOTE(review): a missing or unparseable timestamp yields NaN, which is then
// interpolated into the run_id as "NaN" — confirm sources always carry one.
const TIME_TO_MS = (iso: string): number => new Date(iso).getTime();

/**
 * Transform shared by the three distilled_* streams. They differ only in the
 * prefix used for the fallback run_id / task_id when the row has none.
 */
function distilledTransform(idPrefix: string): TransformDef["transform"] {
  return (input) => {
    const { row, line_offset } = input;
    return {
      run_id: String(row.run_id ?? `${idPrefix}:${line_offset}`),
      task_id: String(row.source_label ?? `${idPrefix}:${line_offset}`),
      timestamp: row.created_at,
      schema_version: EVIDENCE_SCHEMA_VERSION,
      provenance: provenance(input),
      model_name: row.extractor,
      model_role: "extractor" as ModelRole,
      model_provider: "ollama",
      text: row.text,
    };
  };
}

/** One entry per source file; look entries up with transformByPath(). */
export const TRANSFORMS: TransformDef[] = [
  // ── Tier 1: validated 100% in Phase 1 ─────────────────────────────
  {
    source_file_relpath: "data/_kb/distilled_facts.jsonl",
    transform: distilledTransform("distilled_facts"),
  },
  {
    source_file_relpath: "data/_kb/distilled_procedures.jsonl",
    transform: distilledTransform("distilled_procedures"),
  },
  {
    source_file_relpath: "data/_kb/distilled_config_hints.jsonl",
    transform: distilledTransform("distilled_config_hints"),
  },
  {
    source_file_relpath: "data/_kb/contract_analyses.jsonl",
    transform: (input) => {
      const { row } = input;
      return {
        run_id: `contract_analysis:${row.permit_id}:${TIME_TO_MS(row.ts)}`,
        task_id: `permit:${row.permit_id}`,
        timestamp: row.ts,
        schema_version: EVIDENCE_SCHEMA_VERSION,
        provenance: provenance(input),
        model_role: "executor" as ModelRole,
        retrieved_context: {
          matrix_corpora: Object.keys(row.matrix_corpora ?? {}),
          matrix_hits: row.matrix_hits,
        },
        // Accepts either a single note or an array of notes; falsy entries are dropped.
        observer_notes: row.observer_notes ? [row.observer_notes].flat().filter(Boolean) : undefined,
        observer_verdict: row.observer_verdict,
        observer_confidence: row.observer_conf,
        success_markers: row.ok ? ["matrix_hits_above_threshold"] : undefined,
        // NOTE(review): "observer_rejected" is also emitted when row.ok is falsy
        // and the verdict is not "reject"; and a row that is ok AND rejected gets
        // both a success and a failure marker — confirm both are intended.
        failure_markers: !row.ok || row.observer_verdict === "reject" ? ["observer_rejected"] : undefined,
        // NOTE(review): the divisor implies row.cost is in millionths of a dollar — confirm.
        cost_usd: typeof row.cost === "number" ? row.cost / 1_000_000 : undefined,
        latency_ms: row.duration_ms,
        text: row.analysis,
        metadata:
          typeof row.contractor === "string" && row.contractor.length > 0
            ? { contractor: row.contractor }
            : undefined,
      };
    },
  },
  {
    source_file_relpath: "data/_kb/mode_experiments.jsonl",
    transform: (input) => {
      const { row, line_offset } = input;
      return {
        run_id: `mode_exec:${TIME_TO_MS(row.ts)}:${row.file_path ?? line_offset}`,
        task_id: row.task_class,
        timestamp: row.ts,
        schema_version: EVIDENCE_SCHEMA_VERSION,
        provenance: provenance(input),
        model_name: row.model,
        model_role: "executor" as ModelRole,
        // Provider is inferred from the model name: a "/" selects "openrouter".
        model_provider: row.model?.includes("/") ? "openrouter" : "ollama_cloud",
        retrieved_context: {
          matrix_corpora: row.sources?.matrix_corpus,
          matrix_chunks_kept: row.sources?.matrix_chunks_kept,
          matrix_chunks_dropped: row.sources?.matrix_chunks_dropped,
          pathway_fingerprints_seen: row.sources?.bug_fingerprints_count,
        },
        latency_ms: row.latency_ms,
        text: row.response,
        source_files: row.file_path ? [row.file_path] : undefined,
      };
    },
  },
  {
    source_file_relpath: "data/_kb/scrum_reviews.jsonl",
    transform: (input) => {
      const { row } = input;
      return {
        run_id: `scrum:${TIME_TO_MS(row.reviewed_at)}:${row.file}`,
        task_id: `scrum_review:${row.file}`,
        timestamp: row.reviewed_at,
        schema_version: EVIDENCE_SCHEMA_VERSION,
        provenance: provenance(input),
        model_name: row.accepted_model,
        model_role: "executor" as ModelRole,
        source_files: [row.file],
        success_markers: row.accepted_on_attempt
          ? [`accepted_on_attempt_${row.accepted_on_attempt}`]
          : undefined,
        text: row.suggestions_preview,
      };
    },
  },
  {
    source_file_relpath: "data/_kb/observer_escalations.jsonl",
    transform: (input) => {
      const { row } = input;
      return {
        // row.sig_hash is the row's own field, not the caller-computed
        // input.sig_hash that goes into provenance.
        run_id: `obs_esc:${TIME_TO_MS(row.ts)}:${row.sig_hash}`,
        task_id: `observer_escalation:${row.cluster_endpoint ?? "?"}`,
        timestamp: row.ts,
        schema_version: EVIDENCE_SCHEMA_VERSION,
        provenance: provenance(input),
        model_role: "reviewer" as ModelRole,
        prompt_tokens: row.prompt_tokens,
        completion_tokens: row.completion_tokens,
        text: row.analysis,
      };
    },
  },
  {
    source_file_relpath: "data/_kb/audit_facts.jsonl",
    transform: (input) => {
      const { row, line_offset } = input;
      return {
        run_id: `audit_facts:${row.head_sha}:${line_offset}`,
        task_id: `pr:${row.pr_number}`,
        timestamp: row.extracted_at,
        schema_version: EVIDENCE_SCHEMA_VERSION,
        provenance: provenance(input),
        model_name: row.extractor,
        model_role: "extractor" as ModelRole,
        // Only the counts are recorded as text, not the extracted items themselves.
        text: JSON.stringify({
          facts: row.facts?.length ?? 0,
          entities: row.entities?.length ?? 0,
          relationships: row.relationships?.length ?? 0,
        }),
      };
    },
  },

  // ── Tier 2: untested streams that still belong in EvidenceRecord ──
  {
    // auto_apply.jsonl is metadata-only (no text payload). Keep the row
    // in evidence so success/failure markers contribute to scoring,
    // even though the text field is empty.
    source_file_relpath: "data/_kb/auto_apply.jsonl",
    transform: (input) => {
      const { row, line_offset, recorded_at } = input;
      // Deterministic fallback: use the source-file's recorded_at when
      // the row itself lacks a ts. Wall-clock (new Date()) leaked here
      // pre-2026-04-27 — broke bit-identical reproducibility on rows
      // that historically wrote without a ts field.
      const ts: string = row.ts ?? recorded_at;
      const action = String(row.action ?? "unknown");
      const success = action === "committed";
      const reverted = action.includes("reverted");
      return {
        run_id: `auto_apply:${TIME_TO_MS(ts)}:${row.file ?? line_offset}`,
        task_id: `auto_apply:${row.file ?? "?"}`,
        timestamp: ts,
        schema_version: EVIDENCE_SCHEMA_VERSION,
        provenance: provenance(input),
        model_role: "applier" as ModelRole,
        source_files: row.file ? [row.file] : undefined,
        success_markers: success ? ["committed"] : undefined,
        failure_markers: reverted ? [action] : undefined,
      };
    },
  },
  {
    source_file_relpath: "data/_kb/observer_reviews.jsonl",
    transform: (input) => {
      const { row, line_offset } = input;
      return {
        run_id: `obs_rev:${TIME_TO_MS(row.ts ?? row.reviewed_at)}:${row.file ?? line_offset}`,
        task_id: row.file ? `observer_review:${row.file}` : `observer_review:${line_offset}`,
        timestamp: row.ts ?? row.reviewed_at,
        schema_version: EVIDENCE_SCHEMA_VERSION,
        provenance: provenance(input),
        model_role: "reviewer" as ModelRole,
        observer_verdict: row.verdict,
        observer_confidence: row.confidence,
        observer_notes: row.notes ? [row.notes].flat().filter(Boolean) : undefined,
        // NOTE(review): observer_notes tolerates row.notes being an array, but
        // text passes row.notes through unchanged — confirm notes is a string.
        text: row.notes ?? row.review ?? undefined,
      };
    },
  },
  {
    // 2026-04-26 correction: data/_kb/audits.jsonl is the auditor's
    // per-FINDING stream (recon misnamed it "PR verdicts"). Schema:
    //   {embedding, evidence, finding_id, phase, resolution, severity, topic, ts}
    // The actual per-PR verdicts live in data/_auditor/verdicts/*.json,
    // not in this JSONL. So we score by severity here: info/low →
    // accepted (audit found minor issue), medium → partially_accepted,
    // high/critical → rejected (real problem in the audited code).
    source_file_relpath: "data/_kb/audits.jsonl",
    transform: (input) => {
      const { row, line_offset, recorded_at } = input;
      const sev = String(row.severity ?? "unknown").toLowerCase();
      const minor = sev === "info" || sev === "low";
      const blocking = sev === "high" || sev === "critical";
      return {
        run_id: `audit_finding:${row.finding_id ?? line_offset}`,
        task_id: row.phase ? `phase:${row.phase}` : "audit_finding",
        // Fall back to the caller-supplied recorded_at, never the wall clock:
        // a clock read here would give rows without a ts a different timestamp
        // on every run and break byte-identical re-materialization.
        timestamp: row.ts ?? recorded_at,
        schema_version: EVIDENCE_SCHEMA_VERSION,
        provenance: provenance(input),
        model_role: "reviewer" as ModelRole,
        success_markers: minor ? [`audit_severity_${sev}`] : undefined,
        failure_markers: blocking
          ? [`audit_severity_${sev}`]
          : (sev === "medium" ? ["audit_severity_medium"] : undefined),
        text: typeof row.evidence === "string" ? row.evidence : (row.resolution ?? ""),
      };
    },
  },
  {
    source_file_relpath: "data/_kb/outcomes.jsonl",
    transform: (input) => {
      const { row, line_offset } = input;
      return {
        run_id: `outcome:${row.run_id ?? line_offset}`,
        task_id: row.sig_hash ? `outcome_sig:${row.sig_hash}` : `outcome:${line_offset}`,
        timestamp: row.created_at,
        schema_version: EVIDENCE_SCHEMA_VERSION,
        provenance: provenance(input),
        model_role: "executor" as ModelRole,
        latency_ms: typeof row.elapsed_secs === "number" ? Math.round(row.elapsed_secs * 1000) : undefined,
        // Success only when both counts are numeric, non-zero and equal.
        success_markers:
          typeof row.ok_events === "number" && typeof row.total_events === "number"
            ? (row.ok_events === row.total_events && row.total_events > 0 ? ["all_events_ok"] : undefined)
            : undefined,
        validation_results:
          typeof row.total_gap_signals === "number"
            ? { gap_signals: row.total_gap_signals, citation_count: row.total_citations }
            : undefined,
      };
    },
  },
];

/**
 * Looks up the transform registered for a source file.
 *
 * @param source_file_relpath Path relative to the repo root; matched exactly.
 * @returns The first matching TransformDef, or undefined when none is registered.
 */
export function transformByPath(source_file_relpath: string): TransformDef | undefined {
  return TRANSFORMS.find(t => t.source_file_relpath === source_file_relpath);
}

// Re-export for materializer convenience.
export { canonicalSha256 };