root 27b1d27605
Some checks failed
lakehouse/auditor 9 blocking issues: todo!() macro call in tests/real-world/scrum_master_pipeline.ts
distillation: Phase 0 recon + Phase 1 schemas + Phase 2 transforms scaffold
Phase 0 — docs/recon/local-distillation-recon.md
Inventories the 23 _kb JSONL streams + 20 vector corpora + the auditor's
kb_index.ts as substrate for the now.md distillation pipeline. Maps
spec modules to existing producers, identifies real gaps, and lists the
9 schemas to formalize. ZERO implementation in recon — gating doc only.

Phase 1 — auditor/schemas/distillation/
9 schemas + foundation types + 48 tests passing in 502ms:

  types.ts                      shared validators + canonicalSha256 (sketched after this list)
  evidence_record.ts            EVIDENCE_SCHEMA_VERSION=1, ModelRole enum
  scored_run.ts                 4 categories pinned, anchor_grounding ∈ [0,1]
  receipt.ts                    git_sha 40-char, sha256 file refs, validation_pass:bool
  playbook.ts                   non-empty source_run_ids + acceptance_criteria
  scratchpad_summary.ts         validation_status enum, hash sha256
  model_ledger.ts               success_rate ∈ [0,1], sample_count ≥ 1
  rag_sample.ts                 success_score ∈ {accepted, partially_accepted}
  sft_sample.ts                 quality_score MUST be 'accepted' (no leak)
  preference_sample.ts          chosen != rejected, source_run_ids must differ
  evidence_record.test.ts       10 tests, JSON-fixture round-trip
  schemas.test.ts               30 tests, inline fixtures
  realdata.test.ts              8 tests, real-JSONL probe
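
For orientation, a minimal sketch of the canonicalSha256 contract
(assuming recursive key-sort canonicalization over Node/Bun crypto;
the actual implementation in types.ts may differ):

  import { createHash } from "node:crypto";

  // Sort object keys recursively so logically-equal rows hash identically.
  function canonicalize(v: unknown): unknown {
    if (Array.isArray(v)) return v.map(canonicalize);
    if (v !== null && typeof v === "object") {
      const o = v as Record<string, unknown>;
      return Object.fromEntries(Object.keys(o).sort().map(k => [k, canonicalize(o[k])]));
    }
    return v;
  }

  export function canonicalSha256(value: unknown): string {
    return createHash("sha256").update(JSON.stringify(canonicalize(value))).digest("hex");
  }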

Real-data validation probe (one of the 3 notables from recon):
46 rows across 7 sources, 100% pass. distilled_facts/procedures alive.
Report at data/_kb/realdata_validation_report.md (also written by the
test). Confirms the schemas fit existing producers without migration.

Phase 2 scaffold — scripts/distillation/transforms.ts
Promoted PROBES from realdata.test.ts into a real TRANSFORMS array
covering 12 source streams (8 Tier 1 validated + 4 Tier 2 from
recon's untested-streams list). Pure functions: no I/O, no model
calls, no clock reads. The caller supplies recorded_at + sig_hash, so
the materializer is deterministic by construction.
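
A minimal sketch of that calling convention (the materializer itself is
not part of this commit; the example row and run timestamp below are
hypothetical):

  import { transformByPath, canonicalSha256 } from "./transforms";

  // The caller owns the run's single clock read and all I/O; transforms stay pure.
  const runStartedAtIso = new Date().toISOString(); // one clock read per run
  const relpath = "data/_kb/distilled_facts.jsonl";
  const jsonlLine = `{"text":"example","created_at":"2026-04-26T00:00:00Z"}`; // hypothetical row
  const row = JSON.parse(jsonlLine);
  const partial = transformByPath(relpath)?.transform({
    row,
    line_offset: 0,
    source_file_relpath: relpath,
    recorded_at: runStartedAtIso,
    sig_hash: canonicalSha256(row), // same row bytes -> same hash -> same output
  });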

Spec non-negotiables enforced at schema layer (defense in depth):
  - provenance{source_file, sig_hash, recorded_at} required everywhere
  - schema_version mismatch hard-rejects (forward-compat gate)
  - SFT no-leak: validateSftSample REJECTS partially_accepted, rejected,
    needs_human_review — three explicit tests (sketched after this list)
  - Every score has WHY (reasons non-empty)
  - Every playbook traces to source (source_run_ids non-empty)
  - Every preference has WHY (reason non-empty)
  - Receipts substantive (git_sha 40-char, sha256 64-char, validation_pass:bool)
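
Sketch of the no-leak gate's shape (validateSftSample and the four
quality_score labels come from this commit; the result type is an
assumption):

  type QualityScore = "accepted" | "partially_accepted" | "rejected" | "needs_human_review";

  // Only 'accepted' survives; every other label hard-rejects with a WHY,
  // so review-pending or partially-accepted runs never leak into the SFT set.
  function validateSftSample(s: { quality_score: QualityScore }): { ok: boolean; reasons: string[] } {
    return s.quality_score === "accepted"
      ? { ok: true, reasons: [] }
      : { ok: false, reasons: [`sft_no_leak: quality_score '${s.quality_score}' != 'accepted'`] };
  }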

Branch carries uncommitted auditor rebuild work (mode.rs + modes.toml
+ inference.ts + static.ts) blocked on upstream Ollama Cloud kimi-k2
500 ISE; held pending recon-driven design decisions.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 22:30:38 -05:00

254 lines
12 KiB
TypeScript

// Source-row → EvidenceRecord transforms. Promoted from
// auditor/schemas/distillation/realdata.test.ts PROBES array. Each
// transform is pure: no I/O, no model calls, no clock reads (caller
// supplies recorded_at). Deterministic by construction so re-running
// the materializer on identical input produces byte-identical output.
//
// Adding a new source: append a TransformDef. Order in TRANSFORMS[]
// has no effect (each runs against its own source_file).
import type { EvidenceRecord, ModelRole } from "../../auditor/schemas/distillation/evidence_record";
import { EVIDENCE_SCHEMA_VERSION } from "../../auditor/schemas/distillation/evidence_record";
import { canonicalSha256 } from "../../auditor/schemas/distillation/types";

export interface TransformInput {
  row: any;
  line_offset: number;
  source_file_relpath: string; // relative to repo root
  recorded_at: string; // ISO 8601 — caller's "now"
  sig_hash: string; // canonical sha256 of orderedKeys(row), pre-computed by caller
}

export interface TransformDef {
  source_file_relpath: string; // relative to repo root, e.g. "data/_kb/distilled_facts.jsonl"
  transform: (input: TransformInput) => Partial<EvidenceRecord> | null;
}

function provenance(input: TransformInput): EvidenceRecord["provenance"] {
  return {
    source_file: input.source_file_relpath,
    line_offset: input.line_offset,
    sig_hash: input.sig_hash,
    recorded_at: input.recorded_at,
  };
}

const TIME_TO_MS = (iso: string): number => new Date(iso).getTime();
export const TRANSFORMS: TransformDef[] = [
  // ── Tier 1: validated 100% in Phase 1 ─────────────────────────────
  {
    source_file_relpath: "data/_kb/distilled_facts.jsonl",
    transform: ({ row, line_offset, source_file_relpath, recorded_at, sig_hash }) => ({
      run_id: String(row.run_id ?? `distilled_facts:${line_offset}`),
      task_id: String(row.source_label ?? `distilled_facts:${line_offset}`),
      timestamp: row.created_at,
      schema_version: EVIDENCE_SCHEMA_VERSION,
      provenance: provenance({ row, line_offset, source_file_relpath, recorded_at, sig_hash }),
      model_name: row.extractor,
      model_role: "extractor" as ModelRole,
      model_provider: "ollama",
      text: row.text,
    }),
  },
  {
    source_file_relpath: "data/_kb/distilled_procedures.jsonl",
    transform: ({ row, line_offset, source_file_relpath, recorded_at, sig_hash }) => ({
      run_id: String(row.run_id ?? `distilled_procedures:${line_offset}`),
      task_id: String(row.source_label ?? `distilled_procedures:${line_offset}`),
      timestamp: row.created_at,
      schema_version: EVIDENCE_SCHEMA_VERSION,
      provenance: provenance({ row, line_offset, source_file_relpath, recorded_at, sig_hash }),
      model_name: row.extractor,
      model_role: "extractor" as ModelRole,
      model_provider: "ollama",
      text: row.text,
    }),
  },
  {
    source_file_relpath: "data/_kb/distilled_config_hints.jsonl",
    transform: ({ row, line_offset, source_file_relpath, recorded_at, sig_hash }) => ({
      run_id: String(row.run_id ?? `distilled_config_hints:${line_offset}`),
      task_id: String(row.source_label ?? `distilled_config_hints:${line_offset}`),
      timestamp: row.created_at,
      schema_version: EVIDENCE_SCHEMA_VERSION,
      provenance: provenance({ row, line_offset, source_file_relpath, recorded_at, sig_hash }),
      model_name: row.extractor,
      model_role: "extractor" as ModelRole,
      model_provider: "ollama",
      text: row.text,
    }),
  },
  {
    source_file_relpath: "data/_kb/contract_analyses.jsonl",
    transform: ({ row, line_offset, source_file_relpath, recorded_at, sig_hash }) => ({
      run_id: `contract_analysis:${row.permit_id}:${TIME_TO_MS(row.ts)}`,
      task_id: `permit:${row.permit_id}`,
      timestamp: row.ts,
      schema_version: EVIDENCE_SCHEMA_VERSION,
      provenance: provenance({ row, line_offset, source_file_relpath, recorded_at, sig_hash }),
      model_role: "executor" as ModelRole,
      retrieved_context: {
        matrix_corpora: Object.keys(row.matrix_corpora ?? {}),
        matrix_hits: row.matrix_hits,
      },
      observer_notes: row.observer_notes ? [row.observer_notes].flat().filter(Boolean) : undefined,
      observer_verdict: row.observer_verdict,
      observer_confidence: row.observer_conf,
      success_markers: row.ok ? ["matrix_hits_above_threshold"] : undefined,
      failure_markers: !row.ok || row.observer_verdict === "reject" ? ["observer_rejected"] : undefined,
      cost_usd: typeof row.cost === "number" ? row.cost / 1_000_000 : undefined,
      latency_ms: row.duration_ms,
      text: row.analysis,
    }),
  },
  {
    source_file_relpath: "data/_kb/mode_experiments.jsonl",
    transform: ({ row, line_offset, source_file_relpath, recorded_at, sig_hash }) => ({
      run_id: `mode_exec:${TIME_TO_MS(row.ts)}:${row.file_path ?? line_offset}`,
      task_id: row.task_class,
      timestamp: row.ts,
      schema_version: EVIDENCE_SCHEMA_VERSION,
      provenance: provenance({ row, line_offset, source_file_relpath, recorded_at, sig_hash }),
      model_name: row.model,
      model_role: "executor" as ModelRole,
      model_provider: row.model?.includes("/") ? "openrouter" : "ollama_cloud",
      retrieved_context: {
        matrix_corpora: row.sources?.matrix_corpus,
        matrix_chunks_kept: row.sources?.matrix_chunks_kept,
        matrix_chunks_dropped: row.sources?.matrix_chunks_dropped,
        pathway_fingerprints_seen: row.sources?.bug_fingerprints_count,
      },
      latency_ms: row.latency_ms,
      text: row.response,
      source_files: row.file_path ? [row.file_path] : undefined,
    }),
  },
  {
    source_file_relpath: "data/_kb/scrum_reviews.jsonl",
    transform: ({ row, line_offset, source_file_relpath, recorded_at, sig_hash }) => ({
      run_id: `scrum:${TIME_TO_MS(row.reviewed_at)}:${row.file}`,
      task_id: `scrum_review:${row.file}`,
      timestamp: row.reviewed_at,
      schema_version: EVIDENCE_SCHEMA_VERSION,
      provenance: provenance({ row, line_offset, source_file_relpath, recorded_at, sig_hash }),
      model_name: row.accepted_model,
      model_role: "executor" as ModelRole,
      source_files: [row.file],
      success_markers: row.accepted_on_attempt ? [`accepted_on_attempt_${row.accepted_on_attempt}`] : undefined,
      text: row.suggestions_preview,
    }),
  },
  {
    source_file_relpath: "data/_kb/observer_escalations.jsonl",
    transform: ({ row, line_offset, source_file_relpath, recorded_at, sig_hash }) => ({
      run_id: `obs_esc:${TIME_TO_MS(row.ts)}:${row.sig_hash}`,
      task_id: `observer_escalation:${row.cluster_endpoint ?? "?"}`,
      timestamp: row.ts,
      schema_version: EVIDENCE_SCHEMA_VERSION,
      provenance: provenance({ row, line_offset, source_file_relpath, recorded_at, sig_hash }),
      model_role: "reviewer" as ModelRole,
      prompt_tokens: row.prompt_tokens,
      completion_tokens: row.completion_tokens,
      text: row.analysis,
    }),
  },
  {
    source_file_relpath: "data/_kb/audit_facts.jsonl",
    transform: ({ row, line_offset, source_file_relpath, recorded_at, sig_hash }) => ({
      run_id: `audit_facts:${row.head_sha}:${line_offset}`,
      task_id: `pr:${row.pr_number}`,
      timestamp: row.extracted_at,
      schema_version: EVIDENCE_SCHEMA_VERSION,
      provenance: provenance({ row, line_offset, source_file_relpath, recorded_at, sig_hash }),
      model_name: row.extractor,
      model_role: "extractor" as ModelRole,
      text: JSON.stringify({
        facts: row.facts?.length ?? 0,
        entities: row.entities?.length ?? 0,
        relationships: row.relationships?.length ?? 0,
      }),
    }),
  },
  // ── Tier 2: untested streams that still belong in EvidenceRecord ──
  {
    // auto_apply.jsonl is metadata-only (no text payload). Keep the row
    // in evidence so success/failure markers contribute to scoring,
    // even though the text field is empty.
    source_file_relpath: "data/_kb/auto_apply.jsonl",
    transform: ({ row, line_offset, source_file_relpath, recorded_at, sig_hash }) => {
      // Fall back to the caller-supplied recorded_at, never new Date():
      // transforms are clock-free so re-runs on identical input stay byte-identical.
      const ts: string = row.ts ?? recorded_at;
      const action = String(row.action ?? "unknown");
      const success = action === "committed";
      const reverted = action.includes("reverted");
      return {
        run_id: `auto_apply:${TIME_TO_MS(ts)}:${row.file ?? line_offset}`,
        task_id: `auto_apply:${row.file ?? "?"}`,
        timestamp: ts,
        schema_version: EVIDENCE_SCHEMA_VERSION,
        provenance: provenance({ row, line_offset, source_file_relpath, recorded_at, sig_hash }),
        model_role: "applier" as ModelRole,
        source_files: row.file ? [row.file] : undefined,
        success_markers: success ? ["committed"] : undefined,
        failure_markers: reverted ? [action] : undefined,
      };
    },
  },
  {
    source_file_relpath: "data/_kb/observer_reviews.jsonl",
    transform: ({ row, line_offset, source_file_relpath, recorded_at, sig_hash }) => ({
      run_id: `obs_rev:${TIME_TO_MS(row.ts ?? row.reviewed_at)}:${row.file ?? line_offset}`,
      task_id: row.file ? `observer_review:${row.file}` : `observer_review:${line_offset}`,
      timestamp: row.ts ?? row.reviewed_at,
      schema_version: EVIDENCE_SCHEMA_VERSION,
      provenance: provenance({ row, line_offset, source_file_relpath, recorded_at, sig_hash }),
      model_role: "reviewer" as ModelRole,
      observer_verdict: row.verdict,
      observer_confidence: row.confidence,
      observer_notes: row.notes ? [row.notes].flat().filter(Boolean) : undefined,
      text: row.notes ?? row.review ?? undefined,
    }),
  },
  {
    source_file_relpath: "data/_kb/audits.jsonl",
    transform: ({ row, line_offset, source_file_relpath, recorded_at, sig_hash }) => ({
      run_id: `audit:${row.head_sha ?? line_offset}`,
      task_id: `pr:${row.pr_number}`,
      // Clock-free fallback: use the caller-supplied recorded_at, never new Date().
      timestamp: row.audited_at ?? recorded_at,
      schema_version: EVIDENCE_SCHEMA_VERSION,
      provenance: provenance({ row, line_offset, source_file_relpath, recorded_at, sig_hash }),
      model_role: "reviewer" as ModelRole,
      success_markers: row.overall === "approve" ? ["approved"] : undefined,
      failure_markers: row.overall === "block" ? ["blocked"]
        : row.overall === "request_changes" ? ["request_changes"]
        : undefined,
      validation_results: { schema_valid: true, [`overall_${row.overall ?? "?"}`]: true },
      text: row.one_liner ?? "",
    }),
  },
  {
    source_file_relpath: "data/_kb/outcomes.jsonl",
    transform: ({ row, line_offset, source_file_relpath, recorded_at, sig_hash }) => ({
      run_id: `outcome:${row.run_id ?? line_offset}`,
      task_id: row.sig_hash ? `outcome_sig:${row.sig_hash}` : `outcome:${line_offset}`,
      timestamp: row.created_at,
      schema_version: EVIDENCE_SCHEMA_VERSION,
      provenance: provenance({ row, line_offset, source_file_relpath, recorded_at, sig_hash }),
      model_role: "executor" as ModelRole,
      latency_ms: typeof row.elapsed_secs === "number" ? Math.round(row.elapsed_secs * 1000) : undefined,
      success_markers: typeof row.ok_events === "number" && typeof row.total_events === "number"
        ? (row.ok_events === row.total_events && row.total_events > 0 ? ["all_events_ok"] : undefined)
        : undefined,
      validation_results: typeof row.total_gap_signals === "number"
        ? { gap_signals: row.total_gap_signals, citation_count: row.total_citations }
        : undefined,
    }),
  },
];

export function transformByPath(source_file_relpath: string): TransformDef | undefined {
  return TRANSFORMS.find(t => t.source_file_relpath === source_file_relpath);
}

// Re-export for materializer convenience.
export { canonicalSha256 };
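
// Typical wiring (caller-side, outside this module): look up a stream's
// TransformDef via transformByPath, then call def.transform with a single
// run-level recorded_at and sig_hash = canonicalSha256(row).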