remove 7 more orphaned experimental scripts from scripts/

Continuing the test-code-in-main cleanup. These are sequential mode-runner
experiment passes (2/3/4/5) that completed and whose findings were captured
in pathway_memory + the matrix index — the scripts themselves are dead
weight. Plus two one-off probe scripts.

Removed (all 0 refs in production code or automation):
- mode_pass2_corpus_sweep.ts     — 2026-04 corpus sweep experiment
- mode_pass3_variance.ts         — variance measurement run
- mode_pass4_staffing.ts         — staffing-domain pass
- mode_pass5_summarize.ts        — summarization variance
- mode_pass5_variance_paid.ts    — paid-model variance
- overnight_proof.sh             — overnight stress probe (output in logs/)
- ab_t3_test.sh                  — T3 overseer A/B test (output captured in KB)

Verified: 0 references in package.json / justfile / Makefile / any active
.ts/.rs/.sh file. Two mentions remain in docs/recon and docs/MODE_RUNNER_
TUNING_PLAN — those are historical design-doc references, not consumers.

KEPT in scripts/ (have live consumers OR are runtime tools):
- mode_experiment.ts (14 refs), mode_compare.ts (7 refs)
- lance_smoke.sh, build_*_corpus.ts, staffing_demo.py, lance_tune.py,
  generate_demo.py, generate_workers.py, copilot.py, kb_measure.py,
  kb_staffer_report.py, analyze_chicago_contracts.ts, dump_raw_corpus.sh,
  check_phase44_callers.sh, autonomous_agent.py, build_answers_corpus.ts,
  build_lakehouse_corpus.ts, build_scrum_findings_corpus.ts,
  build_symbols_corpus.ts, e2e_pipeline_check.sh, scale_test.py,
  scale_10m_test.sh, run_staffer_demo.sh, stress_test.py

Build clean. If any of these are needed back: git show HEAD~1 -- scripts/<file>

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
root 2026-05-03 02:06:29 -05:00
parent 6aafd41785
commit f4ebd2278b
7 changed files with 0 additions and 1166 deletions

View File

@ -1,77 +0,0 @@
#!/usr/bin/env bash
# A/B test of T3 overseer: does it actually make subsequent runs better?
# Chains Run B (T3 seed) → Run C (T3 + read-back) → Run D (T3 cloud).
# Run A is assumed already complete (launched separately). Aggregates
# metrics at the end into ab_scorecard.json.
set -e
cd "$(dirname "$0")/.."
export OLLAMA_CLOUD_KEY="$(python3 -c "import json; print(json.load(open('/root/llm_team_config.json'))['providers']['ollama_cloud']['api_key'])")"

echo "▶ A/B test start at $(date -Iseconds)"
echo "▶ prior lessons dir: $(ls data/_playbook_lessons 2>/dev/null | wc -l) files"

# run_scenario LABEL [VAR=val ...] — run one scenario pass, logging to
# /tmp/lakehouse_ab_<LABEL>.log, and report the REAL exit status.
# (The old form `cmd || true; echo "exit=$?"` always printed 0 because
# $? was the status of `|| true`; capture the status in a variable instead.)
run_scenario() {
  local label="$1"; shift
  local rc=0
  env "$@" bun tests/multi-agent/scenario.ts > "/tmp/lakehouse_ab_${label}.log" 2>&1 || rc=$?
  echo " ${label} exit=${rc}"
}

# Run B — T3 enabled local, no prior lessons should exist yet
echo "──── RUN B: T3 local, seeds first lesson ────"
run_scenario B
ls data/_playbook_lessons/*.json 2>/dev/null | head -5

# Run C — T3 enabled local, B's lesson should load
echo "──── RUN C: T3 local, reads B's lesson ────"
run_scenario C

# Run D — T3 enabled CLOUD (gpt-oss:120b), reads B+C lessons
echo "──── RUN D: T3 cloud, reads B+C lessons ────"
run_scenario D LH_OVERVIEW_CLOUD=1

echo "▶ all runs done at $(date -Iseconds)"
echo "▶ scorecard:"
python3 - <<'PYEOF'
import json, os, subprocess, datetime

# The 4 most recent playbook dirs are D, C, B, A in reverse-chronological
# order — Run A was launched separately BEFORE this script started.
labels = ['A(no-T3)','B(T3-seed)','C(T3-read)','D(T3-cloud)']
out = subprocess.check_output(
    ['bash', '-c', 'ls -1dt tests/multi-agent/playbooks/scenario-* | head -8'])
all_runs = out.decode().strip().split('\n')
top4 = list(reversed(all_runs[:4]))  # oldest first → A,B,C,D

rows = []
for i, path in enumerate(top4):
    try:
        results = json.load(open(os.path.join(path, 'results.json')))
    except FileNotFoundError:
        # Run still in progress or crashed before writing results — skip it.
        continue
    prior = []
    try:
        prior = json.load(open(os.path.join(path, 'prior_lessons.json')))
    except FileNotFoundError:
        pass
    rows.append({
        'label': labels[i] if i < len(labels) else f'run{i}',
        'path': path,
        'ok_events': sum(1 for r in results if r.get('ok')),
        'total_events': len(results),
        'total_turns': sum(r.get('turns', 0) for r in results),
        'total_gaps': sum(len(r.get('gap_signals', [])) for r in results),
        'total_citations': sum(len(r.get('playbook_citations') or []) for r in results),
        'prior_lessons_loaded': len(prior),
    })

scorecard = {'generated_at': datetime.datetime.utcnow().isoformat() + 'Z', 'runs': rows}
open('tests/multi-agent/playbooks/ab_scorecard.json', 'w').write(json.dumps(scorecard, indent=2))
print(json.dumps(scorecard, indent=2))
PYEOF
echo "▶ saved: tests/multi-agent/playbooks/ab_scorecard.json"

View File

@ -1,121 +0,0 @@
#!/usr/bin/env bun
/**
* Pass 2: matrix corpus + relevance threshold sweep.
*
* For each (corpus, threshold) combination, run codereview_matrix_only
* on the same N files. Compares which corpus actually adds grounded
* findings vs codereview_isolation (matrix-off baseline).
*
* Output: data/_kb/mode_experiments.jsonl gets one row per call,
* tagged via the force_matrix_corpus + force_relevance_threshold
* fields visible in `sources`. Aggregator can then group by corpus.
*
* Usage: bun run scripts/mode_pass2_corpus_sweep.ts
*/
const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
const MODEL = process.env.LH_MODEL ?? "openai/gpt-oss-120b:free";

/** Comma-separated env override, falling back to a baked-in default list. */
const listFromEnv = (value: string | undefined, defaults: string[]): string[] =>
  (value ?? defaults.join(",")).split(",");

// Focus files reviewed under every (corpus, threshold) combination.
const FILES = listFromEnv(process.env.LH_FILES, [
  "crates/queryd/src/delta.rs",
  "crates/queryd/src/service.rs",
  "crates/vectord/src/pathway_memory.rs",
  "crates/gateway/src/v1/mode.rs",
  "crates/aibridge/src/client.rs",
]);

// Candidate matrix corpora swept against each file.
const CORPORA = listFromEnv(process.env.LH_CORPORA, [
  "distilled_procedural_v20260423102847",
  "distilled_factual_v20260423095819",
  "distilled_config_hint_v20260423102847",
  "kb_team_runs_v1",
]);

// Relevance thresholds to try per corpus.
const THRESHOLDS = (process.env.LH_THRESHOLDS ?? "0.2,0.3,0.4,0.5").split(",").map(Number);

/** One row per gateway call. */
interface Result {
  corpus: string;
  threshold: number;
  file: string;
  ok: boolean;
  matrix_kept?: number;     // chunks that survived the relevance filter
  matrix_dropped?: number;  // chunks filtered out
  response_chars?: number;  // length of the model's response
  latency_ms?: number;
  error?: string;           // set when ok === false
}
/**
 * Execute one matrix-only review call against the gateway and flatten the
 * response into a Result row. Never throws — transport/HTTP failures come
 * back as { ok: false, error }.
 */
async function runOne(corpus: string, threshold: number, file: string): Promise<Result> {
  const request = {
    task_class: "scrum_review",
    file_path: file,
    force_mode: "codereview_matrix_only",
    force_model: MODEL,
    force_matrix_corpus: corpus,
    force_relevance_threshold: threshold,
  };
  try {
    const res = await fetch(`${GATEWAY}/v1/mode/execute`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify(request),
      signal: AbortSignal.timeout(180_000),
    });
    if (!res.ok) {
      const detail = await res.text().catch(() => "");
      return { corpus, threshold, file, ok: false, error: `HTTP ${res.status}: ${detail.slice(0, 150)}` };
    }
    const payload: any = await res.json();
    return {
      corpus, threshold, file, ok: true,
      matrix_kept: payload.sources?.matrix_chunks_kept,
      matrix_dropped: payload.sources?.matrix_chunks_dropped,
      response_chars: (payload.response ?? "").length,
      latency_ms: payload.latency_ms,
    };
  } catch (e: any) {
    return { corpus, threshold, file, ok: false, error: e.message };
  }
}
/**
 * Sweep driver: runs every (corpus × threshold × file) combination
 * sequentially, printing one progress line per call and a final
 * kept-rate roll-up table. Per-call rows are appended to
 * data/_kb/mode_experiments.jsonl on the gateway side (per file header).
 */
async function main() {
  const total = CORPORA.length * THRESHOLDS.length * FILES.length;
  console.log(`[pass2] corpora=${CORPORA.length} × thresholds=${THRESHOLDS.length} × files=${FILES.length} = ${total} runs`);
  console.log(`[pass2] model=${MODEL}\n`);
  let i = 0;
  const results: Result[] = [];
  // Strictly sequential: one in-flight request keeps latency numbers comparable.
  for (const corpus of CORPORA) {
    for (const threshold of THRESHOLDS) {
      for (const file of FILES) {
        i++;
        process.stdout.write(` [${i}/${total}] corpus=${corpus.slice(0, 30).padEnd(30)} thr=${threshold.toFixed(1)} ${file.slice(-32).padStart(32)} ... `);
        const r = await runOne(corpus, threshold, file);
        results.push(r);
        if (r.ok) {
          // kept/total gives a quick per-call view of filter aggressiveness.
          const total_chunks = (r.matrix_kept ?? 0) + (r.matrix_dropped ?? 0);
          console.log(`✓ k=${r.matrix_kept}/${total_chunks} resp=${r.response_chars} ${((r.latency_ms ?? 0) / 1000).toFixed(1)}s`);
        } else {
          console.log(`${r.error}`);
        }
      }
    }
  }
  console.log(`\n[pass2] complete · ${results.filter(r => r.ok).length}/${results.length} succeeded`);
  // Per-corpus×threshold roll-up of kept-rate (the matrix usefulness proxy).
  console.log(`\n[pass2] kept-rate by corpus × threshold (avg chunks kept per call):`);
  console.log(` ${"corpus".padEnd(40)} ${THRESHOLDS.map(t => `thr=${t.toFixed(1)}`).join(" ").padStart(35)}`);
  for (const corpus of CORPORA) {
    const cells = THRESHOLDS.map(t => {
      const matched = results.filter(r => r.ok && r.corpus === corpus && r.threshold === t);
      if (matched.length === 0) return " — ";
      const avgKept = matched.reduce((s, r) => s + (r.matrix_kept ?? 0), 0) / matched.length;
      return avgKept.toFixed(1).padStart(5);
    }).join(" ");
    console.log(` ${corpus.slice(0, 40).padEnd(40)} ${cells}`);
  }
  console.log(`\n[pass2] aggregate findings/groundedness with: bun run scripts/mode_compare.ts`);
}
main().catch(e => { console.error(e); process.exit(1); });

View File

@ -1,109 +0,0 @@
#!/usr/bin/env bun
/**
* Pass 3: variance test.
*
* Runs codereview_lakehouse on the SAME file N times at each of M
* temperatures. Measures run-to-run stability of grounded finding
* count, response size, and latency. Anything <100% groundedness
* is a leak; track which symbols got hallucinated.
*
* Output appends to data/_kb/mode_experiments.jsonl. The aggregator
* can group by ts and identify variance buckets.
*
* Usage: bun run scripts/mode_pass3_variance.ts
*/
const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
const MODEL = process.env.LH_MODEL ?? "openai/gpt-oss-120b:free";

// Files under test: comma-separated env override or the default trio.
const DEFAULT_FILES = [
  "crates/queryd/src/delta.rs",
  "crates/vectord/src/pathway_memory.rs",
  "crates/gateway/src/v1/mode.rs",
];
const FILES = (process.env.LH_FILES ?? DEFAULT_FILES.join(",")).split(",");

// Temperatures swept per file, and repetitions per (file, temp) cell.
const TEMPS = (process.env.LH_TEMPS ?? "0.0,0.1,0.3").split(",").map(Number);
const REPS = Number(process.env.LH_REPS ?? 5);

/** One row per gateway call. */
interface Result {
  file: string;
  temp: number;
  rep: number;              // 1-based repetition index within the cell
  ok: boolean;
  response_chars?: number;  // length of the model's response
  latency_ms?: number;
  error?: string;           // set when ok === false
}
/**
 * One codereview_lakehouse call at a forced temperature. Failures are
 * returned as { ok: false, error } rather than thrown.
 */
async function runOne(file: string, temp: number, rep: number): Promise<Result> {
  const request = {
    task_class: "scrum_review",
    file_path: file,
    force_mode: "codereview_lakehouse",
    force_model: MODEL,
    force_temperature: temp,
  };
  try {
    const res = await fetch(`${GATEWAY}/v1/mode/execute`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify(request),
      signal: AbortSignal.timeout(180_000),
    });
    if (!res.ok) {
      const detail = await res.text().catch(() => "");
      return { file, temp, rep, ok: false, error: `HTTP ${res.status}: ${detail.slice(0, 150)}` };
    }
    const payload: any = await res.json();
    return {
      file, temp, rep, ok: true,
      response_chars: (payload.response ?? "").length,
      latency_ms: payload.latency_ms,
    };
  } catch (e: any) {
    return { file, temp, rep, ok: false, error: e.message };
  }
}
/**
 * Variance driver: for each file × temperature cell, repeat the same call
 * REPS times and summarize run-to-run spread of response size. Grounding
 * variance itself is aggregated later by mode_compare.ts (see final line).
 */
async function main() {
  const total = FILES.length * TEMPS.length * REPS;
  console.log(`[pass3] files=${FILES.length} × temps=${TEMPS.length} × reps=${REPS} = ${total} runs`);
  console.log(`[pass3] model=${MODEL}\n`);
  let i = 0;
  const results: Result[] = [];
  for (const file of FILES) {
    for (const temp of TEMPS) {
      for (let rep = 1; rep <= REPS; rep++) {
        i++;
        process.stdout.write(` [${i}/${total}] temp=${temp.toFixed(1)} rep=${rep}/${REPS} ${file.slice(-32).padStart(32)} ... `);
        const r = await runOne(file, temp, rep);
        results.push(r);
        if (r.ok) {
          console.log(`✓ resp=${r.response_chars} ${((r.latency_ms ?? 0) / 1000).toFixed(1)}s`);
        } else {
          console.log(`${r.error}`);
        }
      }
    }
  }
  console.log(`\n[pass3] complete · ${results.filter(r => r.ok).length}/${results.length} succeeded`);
  // Per-file × temp variance summary (response_chars stddev as a quick
  // proxy for output instability).
  console.log(`\n[pass3] response_chars variance (mean ± stddev) by file × temp:`);
  console.log(` ${"file".padEnd(40)} ${TEMPS.map(t => `temp=${t.toFixed(1)}`.padStart(20)).join(" ")}`);
  for (const file of FILES) {
    const cells = TEMPS.map(t => {
      const xs = results.filter(r => r.ok && r.file === file && r.temp === t).map(r => r.response_chars ?? 0);
      if (xs.length === 0) return " — ";
      const mean = xs.reduce((s, x) => s + x, 0) / xs.length;
      // Population stddev (÷n, not ÷(n−1)) — adequate as a rough proxy here.
      const sd = Math.sqrt(xs.reduce((s, x) => s + Math.pow(x - mean, 2), 0) / xs.length);
      return `${Math.round(mean).toString().padStart(7)} ± ${Math.round(sd).toString().padEnd(6)}`.padStart(20);
    }).join(" ");
    console.log(` ${file.slice(0, 40).padEnd(40)} ${cells}`);
  }
  console.log(`\n[pass3] grounding variance via: bun run scripts/mode_compare.ts (look for grounded-N column drift)`);
}
main().catch(e => { console.error(e); process.exit(1); });

View File

@ -1,127 +0,0 @@
#!/usr/bin/env bun
/**
* Pass 4: staffing_inference_lakehouse cross-domain validation.
*
* Runs the staffing-domain mode against synthetic fill requests.
* Validates that the modes-as-prompt-molders architecture generalizes
 * beyond code review — the composer pattern (file_content + bug
* fingerprints + relevance-filtered matrix + domain framing) should
* produce grounded staffing recommendations the same way it produces
* grounded code reviews.
*
* Each fill request is posted as `file_content` (since the runner's
* shape expects file content; for staffing it's the request payload).
* file_path is set to a synthetic path under requests/ so pathway
* memory bucketing groups requests by geo+role.
*
* Usage: bun run scripts/mode_pass4_staffing.ts
*/
const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
const MODEL = process.env.LH_MODEL ?? "openai/gpt-oss-120b:free";

/** Shape of one synthetic fill request fed to the staffing mode. */
interface FillRequest {
  city: string;
  state: string;
  role: string;
  count: number;     // number of workers requested
  deadline: string;  // target fill date (ISO date string)
  notes?: string;    // free-text constraints (certs, shift, license)
}

// Fixed request set: mixed geos and roles so matrix retrieval is exercised
// across different pathway-memory buckets (see file header re: bucketing).
const REQUESTS: FillRequest[] = [
  { city: "Toledo", state: "OH", role: "Welder", count: 2, deadline: "2026-04-29", notes: "OSHA 10 required" },
  { city: "Nashville", state: "TN", role: "Forklift Operator", count: 3, deadline: "2026-05-01" },
  { city: "Chicago", state: "IL", role: "Assembler", count: 5, deadline: "2026-04-30", notes: "second shift" },
  { city: "South Bend", state: "IN", role: "Electrician", count: 1, deadline: "2026-04-28", notes: "journeyman license" },
  { city: "Murfreesboro", state: "TN", role: "Packaging Operator", count: 4, deadline: "2026-05-02" },
];
/**
 * Render a FillRequest as the markdown document posted in `file_content`.
 * NOTE(review): the bare "" element is always removed by filter(Boolean),
 * so no blank line separates the fields from the instruction — presumably
 * intentional, preserved as-is.
 */
function requestToPayload(req: FillRequest): string {
  const lines = [
    "# Fill Request",
    `Role: ${req.role} × ${req.count}`,
    `Location: ${req.city}, ${req.state}`,
    `Deadline: ${req.deadline}`,
    req.notes ? `Notes: ${req.notes}` : "",
    "",
    "Recommend candidates from the matrix data. Cite playbook references.",
  ];
  return lines.filter(Boolean).join("\n");
}
/** Outcome row for one staffing-mode call. */
interface Result {
  req: FillRequest;
  ok: boolean;
  response_chars?: number;    // length of the model's response
  bug_fingerprints?: number;  // sources.bug_fingerprints_count from the gateway
  matrix_kept?: number;       // matrix chunks that passed relevance filtering
  matrix_dropped?: number;    // matrix chunks filtered out
  latency_ms?: number;
  error?: string;             // set when ok === false
  preview?: string;           // first 400 chars of the response, for eyeballing
}
/**
 * POST one fill request through the staffing mode. The request rides in
 * `file_content`; `file_path` is synthesized under requests/ so pathway
 * memory bucketing groups requests by role+geo (see file header).
 */
async function runOne(req: FillRequest): Promise<Result> {
  const slug = (s: string) => s.toLowerCase().replace(/\s+/g, "_");
  const file_path = `requests/${slug(req.role)}_${slug(req.city)}_${req.state}.md`;
  try {
    const res = await fetch(`${GATEWAY}/v1/mode/execute`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({
        task_class: "staffing_inference",
        file_path,
        file_content: requestToPayload(req),
        force_mode: "staffing_inference_lakehouse",
        force_model: MODEL,
      }),
      signal: AbortSignal.timeout(180_000),
    });
    if (!res.ok) {
      const detail = await res.text().catch(() => "");
      return { req, ok: false, error: `HTTP ${res.status}: ${detail.slice(0, 200)}` };
    }
    const payload: any = await res.json();
    const text = payload.response ?? "";
    return {
      req,
      ok: true,
      response_chars: text.length,
      bug_fingerprints: payload.sources?.bug_fingerprints_count,
      matrix_kept: payload.sources?.matrix_chunks_kept,
      matrix_dropped: payload.sources?.matrix_chunks_dropped,
      latency_ms: payload.latency_ms,
      preview: text.slice(0, 400),
    };
  } catch (e: any) {
    return { req, ok: false, error: e.message };
  }
}
/**
 * Drives the synthetic requests sequentially, logging one line per request,
 * then prints the head of the first successful response so a human can
 * eyeball whether the staffing framing held.
 */
async function main() {
  console.log(`[pass4] requests=${REQUESTS.length} model=${MODEL} mode=staffing_inference_lakehouse\n`);
  let i = 0;
  const results: Result[] = [];
  for (const req of REQUESTS) {
    i++;
    process.stdout.write(` [${i}/${REQUESTS.length}] ${req.role.padEnd(22)} × ${req.count} in ${req.city}, ${req.state} ... `);
    const r = await runOne(req);
    results.push(r);
    if (r.ok) {
      // mtx=kept/total mirrors the pass2 progress-line format.
      console.log(`✓ resp=${r.response_chars} bug=${r.bug_fingerprints ?? 0} mtx=${r.matrix_kept ?? 0}/${(r.matrix_kept ?? 0) + (r.matrix_dropped ?? 0)} ${((r.latency_ms ?? 0) / 1000).toFixed(1)}s`);
    } else {
      console.log(`${r.error}`);
    }
  }
  console.log(`\n[pass4] complete · ${results.filter(r => r.ok).length}/${results.length} succeeded\n`);
  // Show first successful response head to verify the framing actually
  // produced staffing-style output (verdict + ranked candidates) not
  // generic prose.
  const first = results.find(r => r.ok && r.preview);
  if (first) {
    console.log(`[pass4] first successful response preview (${first.req.city} ${first.req.role}):`);
    console.log(first.preview!.split("\n").map(l => " | " + l).join("\n"));
  }
}
main().catch(e => { console.error(e); process.exit(1); });

View File

@ -1,169 +0,0 @@
#!/usr/bin/env bun
/**
* Pass 5 variance summarizer. Reads data/_kb/mode_experiments.jsonl
* since a timestamp, groups by (mode|corpus), reports mean ± stddev
* of grounded finding count, plus a head-to-head wins/losses table
* vs the isolation baseline.
*
* Usage:
* bun run scripts/mode_pass5_summarize.ts # default 2h
* bun run scripts/mode_pass5_summarize.ts --since 2026-04-26T22 # explicit
*/
import { readFileSync, existsSync } from "node:fs";
// Window start: `--since <iso-ts>` on the CLI wins; otherwise look back 2 h.
const argSince = (() => {
  const flagAt = Bun.argv.indexOf("--since");
  if (flagAt >= 0) return Bun.argv[flagAt + 1];
  return new Date(Date.now() - 2 * 60 * 60 * 1000).toISOString();
})();
// Append-only experiment log this summarizer reads (per the file header).
const JSONL = "data/_kb/mode_experiments.jsonl";
if (!existsSync(JSONL)) { console.error(`no ${JSONL}`); process.exit(1); }
/** Fields read from one JSONL row of mode_experiments.jsonl. */
interface Row {
  ts: string; mode: string; file_path: string; response: string;
  sources: { matrix_corpus?: string | string[] | null };  // forced corpus, if any
  latency_ms: number;
}
/** Normalize sources.matrix_corpus (string | string[] | null) to a stable key. */
function corpusKey(c: any): string {
  if (!c) return "";
  if (typeof c === "string") return c;
  if (Array.isArray(c)) {
    // Order-insensitive: sort a copy before joining so A+C === C+A.
    if (c.length === 0) return "";
    return [...c].sort().join("+");
  }
  return "";
}

/** Condition label: "mode|corpus" when a corpus was forced, else the bare mode. */
const condKey = (r: Row) => {
  const corpus = corpusKey(r.sources?.matrix_corpus);
  return corpus === "" ? r.mode : `${r.mode}|${corpus}`;
};
// Reuse the same grounding logic as mode_compare — symbols cited in
// findings rows must appear in the focus file, and any line numbers
// must fall within EOF.
/**
 * Pull finding rows out of a markdown review: isolate the "Findings"
 * section, pick the dominant table-row shape, and extract candidate
 * symbols plus cited line numbers from each row.
 */
function extractFindings(md: string): { symbols: string[]; lines: number[] }[] {
  // Match a level 1-3 heading containing "Finding(s)" (optionally "Ranked").
  const sec = /(?:^|\n)#{1,3}[^\na-zA-Z]*(?:Ranked\s+)?Findings?[^\n]*\n/i;
  const m = md.match(sec);
  let section = md;  // no heading found → scan the whole document
  if (m && m.index !== undefined) {
    const after = md.slice(m.index + m[0].length);
    // Truncate at the next known section heading, if any.
    const stop = after.search(/\n#{1,3}[^\na-zA-Z]*(?:Patch|Suggestion|Reference|Summary|Concrete)/i);
    section = stop >= 0 ? after.slice(0, stop) : after;
  }
  // Three row shapes:
  // 1) numbered: `| 1 | ... |`
  // 2) path-with-line: `| service.rs:106 | ... |`
  // 3) path-with-sym: `| crates/vectord/src/pathway_memory.rs:load_fn (≈L220) | ... |`
  // Pick whichever shape matches the most rows (ties favor numbered).
  const numbered = section.split("\n").filter(l => /^\|\s*\*?\*?\d+\*?\*?\s*\|/.test(l));
  const pathRows = section.split("\n").filter(l => /^\|\s*[a-z_/\.][a-z_/\.0-9]*\.(rs|ts|py)\b/i.test(l));
  const rows = numbered.length >= pathRows.length ? numbered : pathRows;
  return rows.map(row => {
    const sym = new Set<string>();
    // Backticked identifiers, then any bare lowercase word of ≥5 chars.
    for (const t of row.matchAll(/`([A-Za-z_][A-Za-z0-9_:]*)`/g)) sym.add(t[1]);
    for (const t of row.matchAll(/\b([a-z][a-z0-9_]{4,})\b/g)) sym.add(t[1]);
    const lines: number[] = [];
    // Cited line numbers look like ":123" or "-123" (2-5 digits).
    for (const t of row.matchAll(/[:\-](\d{2,5})/g)) lines.push(parseInt(t[1]));
    return { symbols: [...sym], lines };
  });
}
/**
 * Score one response against its focus file. A finding is grounded when at
 * least one cited symbol appears verbatim in the file AND none of its cited
 * line numbers exceed the file's line count; any out-of-bounds line number
 * also increments `oob`.
 */
function grounded(md: string, file: string): { total: number; grounded: number; oob: number } {
  const content = readFileSync(file, "utf8");
  const eof = content.split("\n").length;
  const findings = extractFindings(md);
  let groundedCount = 0;
  let oobCount = 0;
  for (const finding of findings) {
    const hasSymbolHit = finding.symbols.some((s) => content.includes(s));
    const hasOobLine = finding.lines.some((l) => l > eof);
    if (hasOobLine) oobCount++;
    if (finding.symbols.length > 0 && hasSymbolHit && !hasOobLine) groundedCount++;
  }
  return { total: findings.length, grounded: groundedCount, oob: oobCount };
}
// Parse the JSONL, keeping only well-formed rows at or after the cutoff.
const lines = readFileSync(JSONL, "utf8").split("\n").filter(Boolean);
const rows: Row[] = [];
for (const l of lines) {
  try {
    const r: Row = JSON.parse(l);
    // ISO-8601 timestamps compare correctly as plain strings.
    if (r.ts < argSince) continue;
    rows.push(r);
  } catch {}  // tolerate truncated/corrupt lines in the append-only log
}
if (rows.length === 0) { console.error(`no rows since ${argSince}`); process.exit(1); }
// Group: condition → file → array of grounded counts
type CellArr = { grnd: number[]; total: number[]; oob: number[]; ms: number[] };
const byCond: Record<string, Record<string, CellArr>> = {};
for (const r of rows) {
  const k = condKey(r);
  byCond[k] ??= {};
  byCond[k][r.file_path] ??= { grnd: [], total: [], oob: [], ms: [] };
  // Re-derive grounding from the stored response against the file on disk.
  const g = grounded(r.response, r.file_path);
  byCond[k][r.file_path].grnd.push(g.grounded);
  byCond[k][r.file_path].total.push(g.total);
  byCond[k][r.file_path].oob.push(g.oob);
  byCond[k][r.file_path].ms.push(r.latency_ms);
}
/**
 * Descriptive stats for a sample: n, mean, SAMPLE stddev (n−1 denominator),
 * min, max. Empty input yields all zeros; a single observation has sd 0.
 */
function stats(xs: number[]): { n: number; mean: number; sd: number; min: number; max: number } {
  const n = xs.length;
  if (n === 0) return { n: 0, mean: 0, sd: 0, min: 0, max: 0 };
  let sum = 0;
  for (const x of xs) sum += x;
  const mean = sum / n;
  let sumSq = 0;
  for (const x of xs) sumSq += (x - mean) ** 2;
  const sd = n === 1 ? 0 : Math.sqrt(sumSq / (n - 1));
  return { n, mean, sd, min: Math.min(...xs), max: Math.max(...xs) };
}
const conditions = Object.keys(byCond).sort();
const files = [...new Set(rows.map(r => r.file_path))].sort();
console.log(`\n═══ Pass 5 variance — since ${argSince} ═══\n`);
console.log(` ${rows.length} rows · ${conditions.length} conditions · ${files.length} files\n`);
// Per-file table: one row per condition with grounded-count stats.
for (const file of files) {
  console.log(`📄 ${file}`);
  console.log(` ${"condition".padEnd(56)} n ${"grounded mean ± sd".padStart(20)} ${"range".padStart(8)} ${"oob".padStart(4)} ${"avg ms".padStart(7)}`);
  console.log(` ${"─".repeat(56)} ─── ${"─".repeat(20)} ${"─".repeat(8)} ${"─".repeat(4)} ${"─".repeat(7)}`);
  for (const c of conditions) {
    const cell = byCond[c]?.[file];
    if (!cell || cell.grnd.length === 0) continue;  // condition never hit this file
    const s = stats(cell.grnd);
    const oobSum = cell.oob.reduce((a, b) => a + b, 0);
    const msMean = cell.ms.reduce((a, b) => a + b, 0) / cell.ms.length;
    const meanSd = `${s.mean.toFixed(1)} ± ${s.sd.toFixed(1)}`;
    const range = `[${s.min}-${s.max}]`;
    console.log(` ${c.padEnd(56)} ${String(s.n).padStart(3)} ${meanSd.padStart(20)} ${range.padStart(8)} ${String(oobSum).padStart(4)} ${Math.round(msMean / 1000).toString().padStart(5)}s`);
  }
  console.log("");
}
// Head-to-head: for each condition vs isolation baseline, count rep-by-rep
// wins across the same file. Requires equal rep counts.
console.log(`═══ Head-to-head: each condition vs isolation, rep-by-rep ═══\n`);
const isoKey = conditions.find(c => c.startsWith("codereview_isolation"));
if (!isoKey) {
  console.log(" no isolation rows in window");
} else {
  console.log(` baseline: ${isoKey}\n`);
  console.log(` ${"challenger".padEnd(56)} wins losses ties Δ mean grnd`);
  console.log(` ${"─".repeat(56)} ${"─".repeat(4)} ${"─".repeat(6)} ${"─".repeat(4)} ${"─".repeat(12)}`);
  for (const c of conditions) {
    if (c === isoKey) continue;
    let wins = 0, losses = 0, ties = 0, deltaSum = 0, n = 0;
    for (const file of files) {
      const isoArr = byCond[isoKey]?.[file]?.grnd ?? [];
      const cArr = byCond[c]?.[file]?.grnd ?? [];
      // Pair reps positionally; extra reps on either side are ignored.
      const k = Math.min(isoArr.length, cArr.length);
      for (let i = 0; i < k; i++) {
        if (cArr[i] > isoArr[i]) wins++;
        else if (cArr[i] < isoArr[i]) losses++;
        else ties++;
        deltaSum += cArr[i] - isoArr[i];
        n++;
      }
    }
    const dMean = n > 0 ? (deltaSum / n).toFixed(2) : "—";
    console.log(` ${c.padEnd(56)} ${String(wins).padStart(4)} ${String(losses).padStart(6)} ${String(ties).padStart(4)} ${dMean.padStart(12)}`);
  }
}

View File

@ -1,105 +0,0 @@
#!/usr/bin/env bun
/**
* Pass 5: variance test for the 2026-04-26 paid-model bake-off.
*
* The pass-4 single-rep sweep showed isolation beating every matrix
* condition by 1.0-1.4 grounded findings/file on grok-4.1-fast. This
* harness runs N reps × M conditions on the file where the effect was
* sharpest (pathway_memory.rs, 1355 lines) so we can decide whether
* the deltas are real signal or run-to-run noise.
*
* Conditions:
* 1. codereview_isolation no matrix
* 2. codereview_lakehouse + corpus=lakehouse_arch_v1 A only
* 3. codereview_lakehouse + corpus=lakehouse_symbols_v1 C only
* 4. codereview_lakehouse (modes.toml default) A+C composed
*
* Output appends per-call to data/_kb/mode_experiments.jsonl. Aggregate
* with `bun run scripts/mode_compare.ts --since <ts>` and read the
* grounded column with multiple rows per (mode|corpus) key.
*
* Usage:
* bun run scripts/mode_pass5_variance_paid.ts
* LH_REPS=3 LH_FILE=crates/queryd/src/delta.rs bun run scripts/mode_pass5_variance_paid.ts
*/
const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
const MODEL = process.env.LH_MODEL ?? "x-ai/grok-4.1-fast";
const FILE = process.env.LH_FILE ?? "crates/vectord/src/pathway_memory.rs";
const REPS = Number(process.env.LH_REPS ?? 5);

/** One experimental arm: a mode plus an optional forced corpus. */
interface Condition {
  label: string;               // space-padded so progress lines align
  mode: string;
  corpus?: string | string[];  // omitted → the modes.toml default composition
}

const ALL_CONDITIONS: Condition[] = [
  { label: "isolation ", mode: "codereview_isolation" },
  { label: "arch_only ", mode: "codereview_lakehouse", corpus: "lakehouse_arch_v1" },
  { label: "symbols_only ", mode: "codereview_lakehouse", corpus: "lakehouse_symbols_v1" },
  { label: "composed (A+C) ", mode: "codereview_lakehouse" /* uses modes.toml default */ },
];

// Optional whitelist via env: LH_CONDITIONS=isolation,composed limits the
// run to a subset (matches against the trimmed `label`). Useful when only
// the head-to-head pair matters and saves ~50% latency on slow rungs.
const wantedLabels = (process.env.LH_CONDITIONS ?? "")
  .split(",")
  .map((s) => s.trim().toLowerCase())
  .filter(Boolean);
const CONDITIONS: Condition[] =
  wantedLabels.length === 0
    ? ALL_CONDITIONS
    : ALL_CONDITIONS.filter((c) =>
        wantedLabels.some((w) => c.label.trim().toLowerCase().startsWith(w)));
/**
 * One gateway call for the given condition. force_matrix_corpus is attached
 * only when the condition pins a corpus; otherwise the gateway uses the
 * modes.toml default composition. Failures return { ok: false, error }.
 */
async function runOne(c: Condition, rep: number): Promise<{ ok: boolean; latency_ms?: number; resp_chars?: number; error?: string }> {
  const request: any = {
    task_class: "scrum_review",
    file_path: FILE,
    force_mode: c.mode,
    force_model: MODEL,
  };
  if (c.corpus !== undefined) request.force_matrix_corpus = c.corpus;
  try {
    const res = await fetch(`${GATEWAY}/v1/mode/execute`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify(request),
      signal: AbortSignal.timeout(240_000),
    });
    if (!res.ok) {
      const detail = await res.text().catch(() => "");
      return { ok: false, error: `HTTP ${res.status}: ${detail.slice(0, 160)}` };
    }
    const payload: any = await res.json();
    return { ok: true, latency_ms: payload.latency_ms, resp_chars: (payload.response ?? "").length };
  } catch (e: any) {
    return { ok: false, error: e.message };
  }
}
/**
 * Runs rep-major (all conditions within rep 1, then rep 2, …) so any slow
 * drift in gateway/model behavior over the session is spread across every
 * condition. Per-call rows land in mode_experiments.jsonl on the gateway
 * side (per the file header); this script only logs progress.
 */
async function main() {
  const total = CONDITIONS.length * REPS;
  console.log(`[pass5] file=${FILE}`);
  console.log(`[pass5] model=${MODEL} · ${CONDITIONS.length} conditions × ${REPS} reps = ${total} runs`);
  console.log("");
  let i = 0;
  const startTs = new Date().toISOString();  // remembered for the --since hint below
  for (let rep = 1; rep <= REPS; rep++) {
    for (const c of CONDITIONS) {
      i++;
      process.stdout.write(` [${i}/${total}] rep=${rep} ${c.label}... `);
      const r = await runOne(c, rep);
      if (r.ok) {
        console.log(`${r.resp_chars} chars · ${((r.latency_ms ?? 0) / 1000).toFixed(1)}s`);
      } else {
        console.log(`${r.error}`);
      }
    }
  }
  console.log(`\n[pass5] complete · started ${startTs}`);
  console.log(`[pass5] aggregate: bun run scripts/mode_compare.ts --since ${startTs}`);
}
main().catch(e => { console.error(e); process.exit(1); });

View File

@ -1,458 +0,0 @@
#!/bin/bash
# OVERNIGHT PROOF — the test that settles it
# Runs unattended: embed 500K, build indexes, measure recall,
# autonomous agent test, sustained load. ~3 hours total.
#
# Monitor: tail -f /home/profit/lakehouse/logs/overnight_proof.log
set -uo pipefail
LOG="/home/profit/lakehouse/logs/overnight_proof.log"
STATE="/tmp/overnight_proof_state"
LOCK="/tmp/overnight_proof.lock"
LH="http://localhost:3100"
GW="http://localhost:3700"
mkdir -p /home/profit/lakehouse/logs
if [ -f "$LOCK" ] && kill -0 "$(cat $LOCK)" 2>/dev/null; then
echo "$(date) Already running" >> "$LOG"
exit 0
fi
echo $$ > "$LOCK"
trap "rm -f $LOCK" EXIT
log() { echo "$(date '+%H:%M:%S') $1" | tee -a "$LOG"; }
touch "$STATE"
step=$(cat "$STATE" 2>/dev/null || echo "embed")
log "═══ OVERNIGHT PROOF: step=$step ═══"
case "$step" in
embed)
log "STEP 1/5: Embedding 500K workers through Ollama (~40 min)"
log " This is the real test — actual nomic-embed-text embeddings, not random vectors"
python3 << 'PYEOF' >> "$LOG" 2>&1
import json, time, sys
from urllib.request import Request, urlopen
from urllib.error import HTTPError
LH = "http://localhost:3100"
def post(path, body, timeout=300):
r = Request(f"{LH}{path}", json.dumps(body).encode(), headers={"Content-Type": "application/json"})
try: return json.loads(urlopen(r, timeout=timeout).read())
except HTTPError as e: return {"error": e.read().decode()[:200]}
except Exception as e: return {"error": str(e)}
# Fetch 500K resume_text for embedding
print("Fetching resume texts from workers_500k...")
r = post("/query/sql", {"sql": "SELECT worker_id, resume_text FROM workers_500k LIMIT 500000"})
if "error" in r:
print(f"SQL error: {r['error']}")
sys.exit(1)
rows = r.get("rows", [])
print(f"Got {len(rows)} rows")
# Build docs for embedding
docs = []
for row in rows:
wid = row.get("worker_id", "")
text = row.get("resume_text", "")
if text and len(text) > 20:
docs.append({"id": f"W500K-{wid}", "text": text})
print(f"{len(docs)} docs ready for embedding")
# Chunk into batches of 50K to avoid timeout issues
BATCH = 50000
for batch_start in range(0, len(docs), BATCH):
batch = docs[batch_start:batch_start + BATCH]
batch_num = batch_start // BATCH + 1
total_batches = (len(docs) + BATCH - 1) // BATCH
idx_name = f"workers_500k_v{batch_num}"
print(f"\nBatch {batch_num}/{total_batches}: {len(batch)} docs → index '{idx_name}'")
t0 = time.time()
r = post("/vectors/index", {
"index_name": idx_name,
"source": "workers_500k",
"documents": batch,
"chunk_size": 500,
"overlap": 50,
}, timeout=600)
if "error" in r:
print(f" Index creation error: {r['error']}")
continue
job_id = r.get("job_id")
chunks = r.get("chunks", 0)
print(f" Job {job_id}: {chunks} chunks, embedding in background...")
# Wait for this batch to complete
for _ in range(600): # 50 min max per batch
time.sleep(5)
status = post(f"/vectors/jobs/{job_id}", None) if job_id else {"status": "unknown"}
if isinstance(status, dict):
state = status.get("status", "unknown")
progress = status.get("embedded_chunks", 0)
if state == "completed":
elapsed = time.time() - t0
rate = chunks / elapsed if elapsed > 0 else 0
print(f" DONE: {chunks} chunks in {elapsed:.0f}s ({rate:.0f}/sec)")
break
elif state == "failed":
print(f" FAILED: {status.get('error', 'unknown')}")
break
sys.stdout.write(f"\r {state}: {progress}/{chunks} chunks...")
sys.stdout.flush()
print()
print("\nAll batches submitted. Checking indexes...")
r = post("/vectors/indexes", None)
if not isinstance(r, list): r = []
for idx in r:
if "500k" in idx.get("index_name", ""):
print(f" {idx['index_name']}: {idx['chunk_count']} chunks")
print("STEP 1 COMPLETE")
PYEOF
if grep -q "STEP 1 COMPLETE" "$LOG"; then
echo "build_indexes" > "$STATE"
log "Embedding complete — moving to index build"
else
log "Embedding may still be running — will check on next heartbeat"
echo "check_embed" > "$STATE"
fi
;;
check_embed)
# Fallback state from the embed step: poll the lakehouse jobs endpoint and
# advance to build_indexes only once no embedding job is still running.
log "Checking embedding job status..."
python3 -c "
import json
from urllib.request import urlopen
# One-shot status probe; output is appended to the run log for the grep below.
r = json.loads(urlopen('http://localhost:3100/vectors/jobs', timeout=30).read())
running = [j for j in r if j.get('status') == 'running']
completed = [j for j in r if j.get('status') == 'completed' and '500k' in j.get('index_name','')]
print(f'Running: {len(running)}, Completed 500K: {len(completed)}')
if not running:
    print('ALL_DONE')
" >> "$LOG" 2>&1
# NOTE(review): this greps the entire accumulated log, so an ALL_DONE marker
# left by an earlier run would also match — confirm the log is fresh per run.
if grep -q "ALL_DONE" "$LOG"; then
echo "build_indexes" > "$STATE"
fi
;;
build_indexes)
# STEP 2: build the ANN structures (HNSW, Lance + IVF_PQ, scalar btree)
# on top of the real 500K embeddings produced by step 1.
log "STEP 2/5: Building HNSW + Lance on real 500K embeddings"
python3 << 'PYEOF' >> "$LOG" 2>&1
import json, time
from urllib.request import Request, urlopen
LH = "http://localhost:3100"

def post(path, body, timeout=600):
    # POST JSON to the lakehouse and decode the JSON reply. An HTTP error
    # raises and kills this heredoc, so the step retries on the next beat.
    r = Request(f"{LH}{path}", json.dumps(body).encode(), headers={"Content-Type": "application/json"})
    return json.loads(urlopen(r, timeout=timeout).read())

# Find the first 500K index created by the embed step.
indexes = json.loads(urlopen(f"{LH}/vectors/indexes", timeout=30).read())
idx_500k = [i for i in indexes if "500k" in i.get("index_name","")]
if not idx_500k:
    print("No 500K index found — embedding may not be complete")
    exit(1)
idx_name = idx_500k[0]["index_name"]
chunks = idx_500k[0]["chunk_count"]
print(f"Using index: {idx_name} ({chunks} chunks)")
# Build HNSW. This is the only phase timed client-side; the later phases
# report their own durations in the response payload, so the dead
# t0 = time.time() reassignments before them have been removed.
print(f"Building HNSW on {chunks} real embeddings...")
t0 = time.time()
r = post("/vectors/hnsw/build", {"index_name": idx_name})
print(f" HNSW: {r.get('vectors',0)} vectors in {time.time()-t0:.0f}s")
# Migrate to Lance (server reports rows written + duration).
print(f"Migrating to Lance...")
r = post(f"/vectors/lance/migrate/{idx_name}", {})
stats = r.get("stats", {})
print(f" Lance: {stats.get('rows_written',0)} rows in {stats.get('duration_secs',0):.1f}s")
# Build IVF_PQ on Lance
# sqrt(50K) ≈ 224 partitions for a 50K batch
print(f"Building IVF_PQ on Lance...")
r = post(f"/vectors/lance/index/{idx_name}", {"num_partitions": 224, "num_bits": 8, "num_sub_vectors": 192})
print(f" IVF_PQ: built in {r.get('build_time_secs',0):.0f}s")
# Scalar btree on doc_id for fast filtered lookups.
print(f"Building scalar btree on doc_id...")
r = post(f"/vectors/lance/scalar-index/{idx_name}/doc_id", {})
print(f" Btree: built in {r.get('build_time_secs',0):.1f}s")
print("STEP 2 COMPLETE")
PYEOF
# Advance only if the marker reached the log; otherwise retry next heartbeat.
if grep -q "STEP 2 COMPLETE" "$LOG"; then
echo "recall_test" > "$STATE"
fi
;;
recall_test)
# STEP 3: auto-generate an eval harness on the built index, then measure
# recall@10 and p50 latency for both the HNSW and Lance IVF_PQ paths.
log "STEP 3/5: Measuring recall on REAL embeddings"
python3 << 'PYEOF' >> "$LOG" 2>&1
import json, time
from urllib.request import Request, urlopen
LH = "http://localhost:3100"
def post(path, body, timeout=300):
    # POST JSON to the lakehouse and decode the JSON reply.
    r = Request(f"{LH}{path}", json.dumps(body).encode(), headers={"Content-Type": "application/json"})
    return json.loads(urlopen(r, timeout=timeout).read())
# Find 500K index
indexes = json.loads(urlopen(f"{LH}/vectors/indexes", timeout=30).read())
idx_500k = [i for i in indexes if "500k" in i.get("index_name","")]
if not idx_500k:
    # NOTE(review): exiting here never prints STEP 3 COMPLETE, so despite
    # the "skipping" wording the shell below will retry this state on every
    # heartbeat — confirm whether it should advance instead.
    print("No 500K index — skipping recall")
    exit(0)
idx_name = idx_500k[0]["index_name"]
# Auto-generate eval harness: 50 sampled queries, ground truth at k=10.
print(f"Generating eval harness for {idx_name}...")
r = post(f"/vectors/hnsw/evals/{idx_name}_recall/autogen", {
    "index_name": idx_name, "sample_count": 50, "k": 10,
})
print(f" Harness: {len(r.get('queries',[]))} queries, k={r.get('k',10)}")
# HNSW recall
print("Measuring HNSW recall...")
r = post("/vectors/hnsw/trial", {
    "index_name": idx_name,
    "harness": f"{idx_name}_recall",
    "config": {"ef_construction": 80, "ef_search": 30, "seed": 42},
})
print(f" HNSW recall@10: {r.get('metrics',{}).get('recall_at_k',0):.4f}")
print(f" HNSW p50: {r.get('metrics',{}).get('search_latency_p50_us',0):.0f}us")
# Lance recall, same harness so the two numbers are comparable.
print("Measuring Lance IVF_PQ recall...")
r = post(f"/vectors/lance/recall/{idx_name}", {
    "harness": f"{idx_name}_recall", "top_k": 10,
})
print(f" Lance recall@10: {r.get('mean_recall',0):.4f}")
print(f" Lance p50: {r.get('latency_p50_us',0):.0f}us")
print("STEP 3 COMPLETE")
PYEOF
if grep -q "STEP 3 COMPLETE" "$LOG"; then
echo "autonomous_test" > "$STATE"
fi
;;
autonomous_test)
# STEP 4: fire 100 mixed operations (semantic match / count / aggregate /
# lookup) against the gateway with no human in the loop and score them.
log "STEP 4/5: 100 staffing questions — LOCAL MODEL ONLY, no human steering"
python3 << 'PYEOF' >> "$LOG" 2>&1
import json, time, random
from urllib.request import Request, urlopen
from urllib.error import HTTPError
GW = "http://localhost:3700"
random.seed(2026)  # fixed seed: same 100-op mix on every run

def gw(path, body=None, timeout=180):
    # GET (no body) or POST JSON to the gateway; failures are folded into
    # an {"error": ...} dict so one bad op cannot kill the whole run.
    data = json.dumps(body).encode() if body else None
    method = "POST" if body else "GET"
    r = Request(f"{GW}{path}", data=data, method=method, headers={"Content-Type":"application/json"} if body else {})
    try: return json.loads(urlopen(r, timeout=timeout).read())
    except HTTPError as e: return {"error": e.read().decode()[:200]}
    except Exception as e: return {"error": str(e)}

def sql(query):
    # Ground-truth SQL helper; returns [] on any error.
    r = gw("/sql", {"sql": query})
    return r.get("rows", []) if "error" not in r else []

ROLES = ["Forklift Operator","Machine Operator","Assembler","Loader","Quality Tech",
"Welder","Sanitation Worker","Shipping Clerk","Production Worker","Maintenance Tech"]
STATES = ["IL","IN","OH","MO","TN","KY","WI","MI"]
print("═══ 100 AUTONOMOUS OPERATIONS ═══")
passed = 0
failed = 0
total_ms = 0
# Mix of operation types
for i in range(100):
    op_type = random.choices(["match","count","aggregate","lookup"], weights=[50,25,15,10])[0]
    role = random.choice(ROLES)
    state = random.choice(STATES)
    rel = round(random.uniform(0.6, 0.9), 2)
    t0 = time.time()
    ok = False
    detail = ""
    if op_type == "match":
        r = gw("/search", {
            "question": f"Find {role} workers in {state}",
            "sql_filter": f"role = '{role}' AND state = '{state}' AND CAST(reliability AS DOUBLE) >= {rel}",
            "dataset": "workers_500k", "top_k": 5, "generate": False,
        })
        matched = len(r.get("sources", []))
        # BUGFIX: an {"error": ...} reply used to count as a pass, because
        # sources was empty and sql_matches defaulted to 0. Require a clean
        # reply; then 0 matches is ok only when SQL also found 0 candidates.
        ok = "error" not in r and (matched > 0 or r.get("sql_matches", 0) == 0)
        detail = f"match: {matched} results (sql={r.get('sql_matches',0)})"
    elif op_type == "count":
        truth = sql(f"SELECT COUNT(*) cnt FROM workers_500k WHERE role = '{role}' AND state = '{state}'")
        expected = truth[0]["cnt"] if truth else 0
        # Use keyword classifier logic: count → SQL
        r = gw("/sql", {"sql": f"SELECT COUNT(*) cnt FROM workers_500k WHERE role = '{role}' AND state = '{state}'"})
        rows = r.get("rows") or [{}]  # guard: an empty rows list would IndexError
        got = rows[0].get("cnt", -1) if "error" not in r else -1
        ok = got == expected
        detail = f"count: got={got} expected={expected}"
    elif op_type == "aggregate":
        r = gw("/sql", {"sql": f"SELECT ROUND(AVG(CAST(reliability AS DOUBLE)),3) avg FROM workers_500k WHERE role = '{role}' AND state = '{state}'"})
        ok = "error" not in r and r.get("rows")
        detail = f"aggregate: {r.get('rows',[{}])[0] if ok else r.get('error','?')[:40]}"
    elif op_type == "lookup":
        wid = random.randint(1, 500000)
        r = gw(f"/worker/{wid}")
        ok = r.get("rows") and len(r["rows"]) > 0
        detail = f"lookup: worker {wid} {'found' if ok else 'not found'}"
    ms = (time.time()-t0)*1000
    total_ms += ms
    if ok: passed += 1
    else: failed += 1
    # Print every 20th op plus every failure to keep the log readable.
    if i % 20 == 0 or not ok:
        icon = "OK" if ok else "FAIL"
        print(f" [{i+1:3d}/100] {icon} {op_type:10s} {detail[:50]:50s} ({ms:.0f}ms)")
pct = passed / 100 * 100
print(f"\n═══ RESULT: {passed}/100 passed ({pct:.0f}%) in {total_ms/1000:.1f}s ═══")
print(f" avg latency: {total_ms/100:.0f}ms per operation")
# Log to playbook
gw("/log", {
    "operation": f"autonomous_100: {passed}/100 ({pct:.0f}%)",
    "approach": "keyword routing + SQL + hybrid, local model only",
    "result": f"passed={passed} failed={failed} avg_ms={total_ms/100:.0f}",
    "context": "overnight proof step 4",
})
if pct >= 90:
    print("STEP 4 COMPLETE — AUTONOMOUS TEST PASSED")
else:
    print(f"STEP 4 COMPLETE — {pct:.0f}% (below 90% target)")
PYEOF
# This step always advances: pass/fail is recorded in the log, not the state.
echo "sustained_load" > "$STATE"
;;
sustained_load)
# STEP 5: 30 minutes of continuous concurrent SQL load against the gateway,
# 10 parallel requests per cycle, to prove sustained throughput.
log "STEP 5/5: Sustained load — 30 minutes of continuous operations"
python3 << 'PYEOF' >> "$LOG" 2>&1
import json, time, random, concurrent.futures
from urllib.request import Request, urlopen
GW = "http://localhost:3700"
random.seed(42)  # deterministic query mix

def gw(path, body=None):
    # GET/POST JSON to the gateway. Any failure (timeout, HTTP error, bad
    # JSON) is reported as an error dict so the load loop keeps going.
    # BUGFIX: was a bare `except:`, which also swallowed KeyboardInterrupt
    # and SystemExit; Exception keeps the run interruptible.
    data = json.dumps(body).encode() if body else None
    r = Request(f"{GW}{path}", data=data, method="POST" if body else "GET",
        headers={"Content-Type":"application/json"} if body else {})
    try: return json.loads(urlopen(r, timeout=60).read())
    except Exception: return {"error": "timeout"}

ROLES = ["Forklift Operator","Machine Operator","Assembler","Loader","Quality Tech"]
STATES = ["IL","IN","OH","MO"]
print("═══ SUSTAINED LOAD: 30 minutes ═══")
duration = 30 * 60 # 30 minutes
t_start = time.time()
ops = 0
errors = 0
cycle = 0
while time.time() - t_start < duration:
    cycle += 1
    batch_ops = 0
    batch_errors = 0
    # Fire 10 concurrent operations
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as pool:
        futures = []
        for _ in range(10):
            role = random.choice(ROLES)
            state = random.choice(STATES)
            futures.append(pool.submit(gw, "/sql", {
                "sql": f"SELECT COUNT(*) FROM workers_500k WHERE role = '{role}' AND state = '{state}'"
            }))
        for f in concurrent.futures.as_completed(futures):
            r = f.result()
            batch_ops += 1
            if "error" in r: batch_errors += 1
    ops += batch_ops
    errors += batch_errors
    elapsed = time.time() - t_start
    remaining = duration - elapsed
    if cycle % 30 == 0: # Log every ~30 cycles
        rate = ops / elapsed
        print(f" {elapsed/60:.0f}min: {ops} ops ({rate:.0f}/sec) errors={errors} remaining={remaining/60:.0f}min")
    time.sleep(1) # 1 sec between batches
elapsed = time.time() - t_start
rate = ops / elapsed
print(f"\n═══ SUSTAINED LOAD COMPLETE ═══")
print(f" Duration: {elapsed/60:.1f} minutes")
print(f" Operations: {ops}")
print(f" Rate: {rate:.0f} ops/sec")
print(f" Errors: {errors} ({100*errors/max(ops,1):.1f}%)")
print(f" STEP 5 COMPLETE")
PYEOF
echo "report" > "$STATE"
;;
report)
# Final summary: emit the completion banner through log() so every line
# is timestamped into the console and $LOG, then mark the state terminal.
banner=(
""
"═══════════════════════════════════════════════════════"
" OVERNIGHT PROOF — COMPLETE"
"═══════════════════════════════════════════════════════"
" Step 1: 500K real embeddings via Ollama"
" Step 2: HNSW + Lance indexes on real data"
" Step 3: Recall measured on real embeddings"
" Step 4: 100 autonomous operations (no human)"
" Step 5: 30 min sustained concurrent load"
""
" Full log: $LOG"
"═══════════════════════════════════════════════════════"
)
for banner_line in "${banner[@]}"; do
log "$banner_line"
done
echo "done" > "$STATE"
;;
done)
# Terminal state: every subsequent heartbeat lands here and is a no-op
# apart from this log line.
log "Overnight proof already complete."
;;
esac