remove 7 more orphaned experimental scripts from scripts/
Continuing the test-code-in-main cleanup. These are sequential mode-runner experiment passes (2/3/4/5) that completed and whose findings were captured in pathway_memory + the matrix index — the scripts themselves are dead weight. Plus two one-off probe scripts. Removed (all 0 refs in production code or automation): - mode_pass2_corpus_sweep.ts — 2026-04 corpus sweep experiment - mode_pass3_variance.ts — variance measurement run - mode_pass4_staffing.ts — staffing-domain pass - mode_pass5_summarize.ts — summarization variance - mode_pass5_variance_paid.ts — paid-model variance - overnight_proof.sh — overnight stress probe (output in logs/) - ab_t3_test.sh — T3 overseer A/B test (output captured in KB) Verified: 0 references in package.json / justfile / Makefile / any active .ts/.rs/.sh file. Two mentions remain in docs/recon and docs/MODE_RUNNER_ TUNING_PLAN — those are historical design-doc references, not consumers. KEPT in scripts/ (have live consumers OR are runtime tools): - mode_experiment.ts (14 refs), mode_compare.ts (7 refs) - lance_smoke.sh, build_*_corpus.ts, staffing_demo.py, lance_tune.py, generate_demo.py, generate_workers.py, copilot.py, kb_measure.py, kb_staffer_report.py, analyze_chicago_contracts.ts, dump_raw_corpus.sh, check_phase44_callers.sh, autonomous_agent.py, build_answers_corpus.ts, build_lakehouse_corpus.ts, build_scrum_findings_corpus.ts, build_symbols_corpus.ts, e2e_pipeline_check.sh, scale_test.py, scale_10m_test.sh, run_staffer_demo.sh, stress_test.py Build clean. If any of these are needed back: git show HEAD~1 -- scripts/<file> Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
6aafd41785
commit
f4ebd2278b
@ -1,77 +0,0 @@
|
||||
#!/usr/bin/env bash
# A/B test of T3 overseer: does it actually make subsequent runs better?
# Chains Run B (T3 seed) → Run C (T3 + read-back) → Run D (T3 cloud).
# Run A is assumed already complete (launched separately). Aggregates
# metrics at the end into ab_scorecard.json.
#
# Fix: the per-run "exit=$?" echoes previously always printed 0, because
# `|| true` replaced the scenario's exit status before it was read. Each
# run now captures its status explicitly (safe under `set -e`).

set -e
cd "$(dirname "$0")/.."

# Pull the Ollama Cloud API key out of the shared team config.
export OLLAMA_CLOUD_KEY="$(python3 -c "import json; print(json.load(open('/root/llm_team_config.json'))['providers']['ollama_cloud']['api_key'])")"

echo "▶ A/B test start at $(date -Iseconds)"
echo "▶ prior lessons dir: $(ls data/_playbook_lessons 2>/dev/null | wc -l) files"

# Run B — T3 enabled local, no prior lessons should exist yet
echo "──── RUN B: T3 local, seeds first lesson ────"
status=0
bun tests/multi-agent/scenario.ts > /tmp/lakehouse_ab_B.log 2>&1 || status=$?
echo " B exit=$status"
ls data/_playbook_lessons/*.json 2>/dev/null | head -5

# Run C — T3 enabled local, B's lesson should load
echo "──── RUN C: T3 local, reads B's lesson ────"
status=0
bun tests/multi-agent/scenario.ts > /tmp/lakehouse_ab_C.log 2>&1 || status=$?
echo " C exit=$status"

# Run D — T3 enabled CLOUD (gpt-oss:120b), reads B+C lessons
echo "──── RUN D: T3 cloud, reads B+C lessons ────"
status=0
LH_OVERVIEW_CLOUD=1 bun tests/multi-agent/scenario.ts > /tmp/lakehouse_ab_D.log 2>&1 || status=$?
echo " D exit=$status"

echo "▶ all runs done at $(date -Iseconds)"
echo "▶ scorecard:"
ls -1dt tests/multi-agent/playbooks/scenario-* | head -4 | tac | python3 -c "
import sys, os, json, subprocess

# Drain the piped listing so the upstream pipeline is not broken; the
# authoritative run list is re-read below so that Run A — which predates
# the three runs this script launched — is also included.
_ = sys.stdin.read()
labels = ['A(no-T3)','B(T3-seed)','C(T3-read)','D(T3-cloud)']

# The 4 most recent scenario dirs are D, C, B, A (reverse chronological).
all_runs = subprocess.check_output(['bash','-c','ls -1dt tests/multi-agent/playbooks/scenario-* | head -8']).decode().strip().split('\n')
top4 = list(reversed(all_runs[:4]))  # oldest first → A,B,C,D
rows = []
for i, path in enumerate(top4):
    try:
        results = json.load(open(os.path.join(path, 'results.json')))
    except FileNotFoundError:
        continue  # run dir without results — skip silently
    ok = sum(1 for r in results if r.get('ok'))
    turns = sum(r.get('turns', 0) for r in results)
    gaps = sum(len(r.get('gap_signals', [])) for r in results)
    cites = sum(len(r.get('playbook_citations') or []) for r in results)
    prior = []
    try:
        prior = json.load(open(os.path.join(path, 'prior_lessons.json')))
    except FileNotFoundError:
        pass  # no prior lessons recorded for this run
    rows.append({
        'label': labels[i] if i < len(labels) else f'run{i}',
        'path': path,
        'ok_events': ok,
        'total_events': len(results),
        'total_turns': turns,
        'total_gaps': gaps,
        'total_citations': cites,
        'prior_lessons_loaded': len(prior),
    })

scorecard = {'generated_at': __import__('datetime').datetime.utcnow().isoformat()+'Z', 'runs': rows}
open('tests/multi-agent/playbooks/ab_scorecard.json','w').write(json.dumps(scorecard, indent=2))
print(json.dumps(scorecard, indent=2))
"
echo "▶ saved: tests/multi-agent/playbooks/ab_scorecard.json"
|
||||
@ -1,121 +0,0 @@
|
||||
#!/usr/bin/env bun
/**
 * Pass 2: matrix corpus + relevance threshold sweep.
 *
 * For each (corpus, threshold) combination, run codereview_matrix_only
 * on the same N files. Compares which corpus actually adds grounded
 * findings vs codereview_isolation (matrix-off baseline).
 *
 * Output: data/_kb/mode_experiments.jsonl gets one row per call,
 * tagged via the force_matrix_corpus + force_relevance_threshold
 * fields visible in `sources`. Aggregator can then group by corpus.
 *
 * Usage: bun run scripts/mode_pass2_corpus_sweep.ts
 */

// Gateway endpoint and model, overridable via env for one-off reruns.
const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
const MODEL = process.env.LH_MODEL ?? "openai/gpt-oss-120b:free";

// Focus files for the sweep; comma-separated override via LH_FILES.
const FILES = (process.env.LH_FILES ?? [
"crates/queryd/src/delta.rs",
"crates/queryd/src/service.rs",
"crates/vectord/src/pathway_memory.rs",
"crates/gateway/src/v1/mode.rs",
"crates/aibridge/src/client.rs",
].join(",")).split(",");

// Candidate matrix corpora to compare; override via LH_CORPORA.
const CORPORA = (process.env.LH_CORPORA ?? [
"distilled_procedural_v20260423102847",
"distilled_factual_v20260423095819",
"distilled_config_hint_v20260423102847",
"kb_team_runs_v1",
].join(",")).split(",");

// Relevance cut-offs swept per corpus; override via LH_THRESHOLDS.
const THRESHOLDS = (process.env.LH_THRESHOLDS ?? "0.2,0.3,0.4,0.5").split(",").map(Number);

// One row per gateway call; optional fields are absent on failure.
interface Result {
corpus: string;
threshold: number;
file: string;
ok: boolean;
matrix_kept?: number;    // chunks surviving the relevance filter
matrix_dropped?: number; // chunks filtered out
response_chars?: number; // size of the model response
latency_ms?: number;     // end-to-end gateway latency
error?: string;          // populated only when ok === false
}
|
||||
|
||||
async function runOne(corpus: string, threshold: number, file: string): Promise<Result> {
|
||||
try {
|
||||
const r = await fetch(`${GATEWAY}/v1/mode/execute`, {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
task_class: "scrum_review",
|
||||
file_path: file,
|
||||
force_mode: "codereview_matrix_only",
|
||||
force_model: MODEL,
|
||||
force_matrix_corpus: corpus,
|
||||
force_relevance_threshold: threshold,
|
||||
}),
|
||||
signal: AbortSignal.timeout(180_000),
|
||||
});
|
||||
if (!r.ok) {
|
||||
const body = await r.text().catch(() => "");
|
||||
return { corpus, threshold, file, ok: false, error: `HTTP ${r.status}: ${body.slice(0, 150)}` };
|
||||
}
|
||||
const j: any = await r.json();
|
||||
return {
|
||||
corpus, threshold, file, ok: true,
|
||||
matrix_kept: j.sources?.matrix_chunks_kept,
|
||||
matrix_dropped: j.sources?.matrix_chunks_dropped,
|
||||
response_chars: (j.response ?? "").length,
|
||||
latency_ms: j.latency_ms,
|
||||
};
|
||||
} catch (e: any) {
|
||||
return { corpus, threshold, file, ok: false, error: e.message };
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Drive the full sweep: CORPORA × THRESHOLDS × FILES, sequentially (one
 * in-flight call at a time so gateway-side JSONL rows stay ordered), then
 * print a kept-rate roll-up per corpus × threshold.
 */
async function main() {
  const total = CORPORA.length * THRESHOLDS.length * FILES.length;
  console.log(`[pass2] corpora=${CORPORA.length} × thresholds=${THRESHOLDS.length} × files=${FILES.length} = ${total} runs`);
  console.log(`[pass2] model=${MODEL}\n`);
  let i = 0;
  const results: Result[] = [];
  for (const corpus of CORPORA) {
    for (const threshold of THRESHOLDS) {
      for (const file of FILES) {
        i++;
        // Progress line stays open; the ✓/✗ outcome completes it below.
        process.stdout.write(` [${i}/${total}] corpus=${corpus.slice(0, 30).padEnd(30)} thr=${threshold.toFixed(1)} ${file.slice(-32).padStart(32)} ... `);
        const r = await runOne(corpus, threshold, file);
        results.push(r);
        if (r.ok) {
          const total_chunks = (r.matrix_kept ?? 0) + (r.matrix_dropped ?? 0);
          console.log(`✓ k=${r.matrix_kept}/${total_chunks} resp=${r.response_chars} ${((r.latency_ms ?? 0) / 1000).toFixed(1)}s`);
        } else {
          console.log(`✗ ${r.error}`);
        }
      }
    }
  }

  console.log(`\n[pass2] complete · ${results.filter(r => r.ok).length}/${results.length} succeeded`);

  // Per-corpus×threshold roll-up of kept-rate (the matrix usefulness proxy).
  console.log(`\n[pass2] kept-rate by corpus × threshold (avg chunks kept per call):`);
  console.log(` ${"corpus".padEnd(40)} ${THRESHOLDS.map(t => `thr=${t.toFixed(1)}`).join(" ").padStart(35)}`);
  for (const corpus of CORPORA) {
    const cells = THRESHOLDS.map(t => {
      const matched = results.filter(r => r.ok && r.corpus === corpus && r.threshold === t);
      if (matched.length === 0) return " — ";
      const avgKept = matched.reduce((s, r) => s + (r.matrix_kept ?? 0), 0) / matched.length;
      return avgKept.toFixed(1).padStart(5);
    }).join(" ");
    console.log(` ${corpus.slice(0, 40).padEnd(40)} ${cells}`);
  }

  console.log(`\n[pass2] aggregate findings/groundedness with: bun run scripts/mode_compare.ts`);
}

main().catch(e => { console.error(e); process.exit(1); });
|
||||
@ -1,109 +0,0 @@
|
||||
#!/usr/bin/env bun
/**
 * Pass 3: variance test.
 *
 * Runs codereview_lakehouse on the SAME file N times at each of M
 * temperatures. Measures run-to-run stability of grounded finding
 * count, response size, and latency. Anything <100% groundedness
 * is a leak; track which symbols got hallucinated.
 *
 * Output appends to data/_kb/mode_experiments.jsonl. The aggregator
 * can group by ts and identify variance buckets.
 *
 * Usage: bun run scripts/mode_pass3_variance.ts
 */

// Gateway endpoint and model, overridable via env.
const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
const MODEL = process.env.LH_MODEL ?? "openai/gpt-oss-120b:free";

// Focus files; comma-separated override via LH_FILES.
const FILES = (process.env.LH_FILES ?? [
"crates/queryd/src/delta.rs",
"crates/vectord/src/pathway_memory.rs",
"crates/gateway/src/v1/mode.rs",
].join(",")).split(",");

// Sampling temperatures, and repetitions per (file, temp) cell.
const TEMPS = (process.env.LH_TEMPS ?? "0.0,0.1,0.3").split(",").map(Number);
const REPS = Number(process.env.LH_REPS ?? 5);

// One row per call; optional fields are absent on failure.
interface Result {
file: string;
temp: number;
rep: number;             // 1-based repetition index within the cell
ok: boolean;
response_chars?: number; // size of the model response
latency_ms?: number;     // end-to-end gateway latency
error?: string;          // populated only when ok === false
}
|
||||
|
||||
async function runOne(file: string, temp: number, rep: number): Promise<Result> {
|
||||
try {
|
||||
const r = await fetch(`${GATEWAY}/v1/mode/execute`, {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
task_class: "scrum_review",
|
||||
file_path: file,
|
||||
force_mode: "codereview_lakehouse",
|
||||
force_model: MODEL,
|
||||
force_temperature: temp,
|
||||
}),
|
||||
signal: AbortSignal.timeout(180_000),
|
||||
});
|
||||
if (!r.ok) {
|
||||
const body = await r.text().catch(() => "");
|
||||
return { file, temp, rep, ok: false, error: `HTTP ${r.status}: ${body.slice(0, 150)}` };
|
||||
}
|
||||
const j: any = await r.json();
|
||||
return {
|
||||
file, temp, rep, ok: true,
|
||||
response_chars: (j.response ?? "").length,
|
||||
latency_ms: j.latency_ms,
|
||||
};
|
||||
} catch (e: any) {
|
||||
return { file, temp, rep, ok: false, error: e.message };
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Drive FILES × TEMPS × REPS sequentially, then print a mean ± stddev
 * table of response size per (file, temp) cell as a cheap instability
 * proxy. Grounding-level variance is left to mode_compare.ts.
 */
async function main() {
  const total = FILES.length * TEMPS.length * REPS;
  console.log(`[pass3] files=${FILES.length} × temps=${TEMPS.length} × reps=${REPS} = ${total} runs`);
  console.log(`[pass3] model=${MODEL}\n`);
  let i = 0;
  const results: Result[] = [];
  for (const file of FILES) {
    for (const temp of TEMPS) {
      for (let rep = 1; rep <= REPS; rep++) {
        i++;
        // Progress line stays open; the ✓/✗ outcome completes it below.
        process.stdout.write(` [${i}/${total}] temp=${temp.toFixed(1)} rep=${rep}/${REPS} ${file.slice(-32).padStart(32)} ... `);
        const r = await runOne(file, temp, rep);
        results.push(r);
        if (r.ok) {
          console.log(`✓ resp=${r.response_chars} ${((r.latency_ms ?? 0) / 1000).toFixed(1)}s`);
        } else {
          console.log(`✗ ${r.error}`);
        }
      }
    }
  }

  console.log(`\n[pass3] complete · ${results.filter(r => r.ok).length}/${results.length} succeeded`);

  // Per-file × temp variance summary (response_chars stddev as a quick
  // proxy for output instability). Note: population stddev (divides by
  // xs.length), fine for a quick table.
  console.log(`\n[pass3] response_chars variance (mean ± stddev) by file × temp:`);
  console.log(` ${"file".padEnd(40)} ${TEMPS.map(t => `temp=${t.toFixed(1)}`.padStart(20)).join(" ")}`);
  for (const file of FILES) {
    const cells = TEMPS.map(t => {
      const xs = results.filter(r => r.ok && r.file === file && r.temp === t).map(r => r.response_chars ?? 0);
      if (xs.length === 0) return " — ";
      const mean = xs.reduce((s, x) => s + x, 0) / xs.length;
      const sd = Math.sqrt(xs.reduce((s, x) => s + Math.pow(x - mean, 2), 0) / xs.length);
      return `${Math.round(mean).toString().padStart(7)} ± ${Math.round(sd).toString().padEnd(6)}`.padStart(20);
    }).join(" ");
    console.log(` ${file.slice(0, 40).padEnd(40)} ${cells}`);
  }

  console.log(`\n[pass3] grounding variance via: bun run scripts/mode_compare.ts (look for grounded-N column drift)`);
}

main().catch(e => { console.error(e); process.exit(1); });
|
||||
@ -1,127 +0,0 @@
|
||||
#!/usr/bin/env bun
/**
 * Pass 4: staffing_inference_lakehouse cross-domain validation.
 *
 * Runs the staffing-domain mode against synthetic fill requests.
 * Validates that the modes-as-prompt-molders architecture generalizes
 * beyond code review — the composer pattern (file_content + bug
 * fingerprints + relevance-filtered matrix + domain framing) should
 * produce grounded staffing recommendations the same way it produces
 * grounded code reviews.
 *
 * Each fill request is posted as `file_content` (since the runner's
 * shape expects file content; for staffing it's the request payload).
 * file_path is set to a synthetic path under requests/ so pathway
 * memory bucketing groups requests by geo+role.
 *
 * Usage: bun run scripts/mode_pass4_staffing.ts
 */

// Gateway endpoint and model, overridable via env.
const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
const MODEL = process.env.LH_MODEL ?? "openai/gpt-oss-120b:free";

// Shape of one synthetic staffing fill request.
interface FillRequest {
city: string;
state: string;
role: string;
count: number;    // number of workers requested
deadline: string; // ISO date by which the fill is needed
notes?: string;   // optional constraints (certs, shift, license)
}

// Fixed synthetic workload spanning several geos and trades.
const REQUESTS: FillRequest[] = [
{ city: "Toledo", state: "OH", role: "Welder", count: 2, deadline: "2026-04-29", notes: "OSHA 10 required" },
{ city: "Nashville", state: "TN", role: "Forklift Operator", count: 3, deadline: "2026-05-01" },
{ city: "Chicago", state: "IL", role: "Assembler", count: 5, deadline: "2026-04-30", notes: "second shift" },
{ city: "South Bend", state: "IN", role: "Electrician", count: 1, deadline: "2026-04-28", notes: "journeyman license" },
{ city: "Murfreesboro", state: "TN", role: "Packaging Operator", count: 4, deadline: "2026-05-02" },
];
|
||||
|
||||
function requestToPayload(req: FillRequest): string {
|
||||
return [
|
||||
`# Fill Request`,
|
||||
`Role: ${req.role} × ${req.count}`,
|
||||
`Location: ${req.city}, ${req.state}`,
|
||||
`Deadline: ${req.deadline}`,
|
||||
req.notes ? `Notes: ${req.notes}` : "",
|
||||
"",
|
||||
"Recommend candidates from the matrix data. Cite playbook references.",
|
||||
].filter(Boolean).join("\n");
|
||||
}
|
||||
|
||||
// One row per staffing request; optional fields are absent on failure.
interface Result {
req: FillRequest;
ok: boolean;
response_chars?: number;  // size of the model response
bug_fingerprints?: number; // fingerprints the composer attached
matrix_kept?: number;      // matrix chunks surviving relevance filter
matrix_dropped?: number;   // matrix chunks filtered out
latency_ms?: number;       // end-to-end gateway latency
error?: string;            // populated only when ok === false
preview?: string;          // first 400 chars of the response, for eyeballing
}

/**
 * Post one fill request through the staffing_inference_lakehouse mode.
 * The request body rides in `file_content`; `file_path` is a synthetic
 * requests/<role>_<city>_<state>.md path so pathway-memory bucketing
 * groups by geo+role. Never throws — failures return { ok: false, error }.
 */
async function runOne(req: FillRequest): Promise<Result> {
  const payload = requestToPayload(req);
  const file_path = `requests/${req.role.toLowerCase().replace(/\s+/g, "_")}_${req.city.toLowerCase().replace(/\s+/g, "_")}_${req.state}.md`;
  try {
    // 3-minute hard timeout per request.
    const r = await fetch(`${GATEWAY}/v1/mode/execute`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({
        task_class: "staffing_inference",
        file_path,
        file_content: payload,
        force_mode: "staffing_inference_lakehouse",
        force_model: MODEL,
      }),
      signal: AbortSignal.timeout(180_000),
    });
    if (!r.ok) {
      const body = await r.text().catch(() => "");
      return { req, ok: false, error: `HTTP ${r.status}: ${body.slice(0, 200)}` };
    }
    const j: any = await r.json();
    return {
      req, ok: true,
      response_chars: (j.response ?? "").length,
      bug_fingerprints: j.sources?.bug_fingerprints_count,
      matrix_kept: j.sources?.matrix_chunks_kept,
      matrix_dropped: j.sources?.matrix_chunks_dropped,
      latency_ms: j.latency_ms,
      preview: (j.response ?? "").slice(0, 400),
    };
  } catch (e: any) {
    return { req, ok: false, error: e.message };
  }
}
|
||||
|
||||
/**
 * Run every synthetic fill request sequentially, then print the first
 * successful response's head as a sanity check on output framing.
 */
async function main() {
  console.log(`[pass4] requests=${REQUESTS.length} model=${MODEL} mode=staffing_inference_lakehouse\n`);
  let i = 0;
  const results: Result[] = [];
  for (const req of REQUESTS) {
    i++;
    // Progress line stays open; the ✓/✗ outcome completes it below.
    process.stdout.write(` [${i}/${REQUESTS.length}] ${req.role.padEnd(22)} × ${req.count} in ${req.city}, ${req.state} ... `);
    const r = await runOne(req);
    results.push(r);
    if (r.ok) {
      console.log(`✓ resp=${r.response_chars} bug=${r.bug_fingerprints ?? 0} mtx=${r.matrix_kept ?? 0}/${(r.matrix_kept ?? 0) + (r.matrix_dropped ?? 0)} ${((r.latency_ms ?? 0) / 1000).toFixed(1)}s`);
    } else {
      console.log(`✗ ${r.error}`);
    }
  }

  console.log(`\n[pass4] complete · ${results.filter(r => r.ok).length}/${results.length} succeeded\n`);

  // Show first successful response head to verify the framing actually
  // produced staffing-style output (verdict + ranked candidates) not
  // generic prose.
  const first = results.find(r => r.ok && r.preview);
  if (first) {
    console.log(`[pass4] first successful response preview (${first.req.city} ${first.req.role}):`);
    console.log(first.preview!.split("\n").map(l => " | " + l).join("\n"));
  }
}

main().catch(e => { console.error(e); process.exit(1); });
|
||||
@ -1,169 +0,0 @@
|
||||
#!/usr/bin/env bun
/**
 * Pass 5 variance summarizer. Reads data/_kb/mode_experiments.jsonl
 * since a timestamp, groups by (mode|corpus), reports mean ± stddev
 * of grounded finding count, plus a head-to-head wins/losses table
 * vs the isolation baseline.
 *
 * Usage:
 *   bun run scripts/mode_pass5_summarize.ts                       # default 2h
 *   bun run scripts/mode_pass5_summarize.ts --since 2026-04-26T22 # explicit
 */

import { readFileSync, existsSync } from "node:fs";

// Window start: explicit --since <iso-prefix>, else two hours ago.
// Comparison below is lexicographic on ISO strings, so a prefix works.
const argSince = (() => {
const i = Bun.argv.indexOf("--since");
return i >= 0 ? Bun.argv[i + 1] : new Date(Date.now() - 2 * 60 * 60 * 1000).toISOString();
})();

const JSONL = "data/_kb/mode_experiments.jsonl";
if (!existsSync(JSONL)) { console.error(`no ${JSONL}`); process.exit(1); }

// Subset of an experiment-log row that this summarizer reads.
interface Row {
ts: string; mode: string; file_path: string; response: string;
sources: { matrix_corpus?: string | string[] | null };
latency_ms: number;
}
|
||||
|
||||
function corpusKey(c: any): string {
|
||||
if (!c) return "";
|
||||
if (typeof c === "string") return c;
|
||||
if (Array.isArray(c)) return c.length === 0 ? "" : [...c].sort().join("+");
|
||||
return "";
|
||||
}
|
||||
const condKey = (r: Row) => {
|
||||
const c = corpusKey(r.sources?.matrix_corpus);
|
||||
return c ? `${r.mode}|${c}` : r.mode;
|
||||
};
|
||||
|
||||
// Reuse the same grounding logic as mode_compare — symbols cited in
// findings rows must appear in the focus file, and any line numbers
// must fall within EOF.
/**
 * Parse the markdown review into finding rows, each carrying the set of
 * candidate symbol tokens it mentions plus any cited line numbers.
 *
 * First isolates the "Findings" section (from a `# / ## / ###` heading
 * matching /Findings?/ up to the next Patch/Suggestion/Reference/
 * Summary/Concrete heading); if no such heading exists, the whole
 * document is scanned.
 */
function extractFindings(md: string): { symbols: string[]; lines: number[] }[] {
const sec = /(?:^|\n)#{1,3}[^\na-zA-Z]*(?:Ranked\s+)?Findings?[^\n]*\n/i;
const m = md.match(sec);
let section = md;
if (m && m.index !== undefined) {
const after = md.slice(m.index + m[0].length);
const stop = after.search(/\n#{1,3}[^\na-zA-Z]*(?:Patch|Suggestion|Reference|Summary|Concrete)/i);
section = stop >= 0 ? after.slice(0, stop) : after;
}
// Three row shapes:
//   1) numbered: `| 1 | ... |`
//   2) path-with-line: `| service.rs:106 | ... |`
//   3) path-with-sym: `| crates/vectord/src/pathway_memory.rs:load_fn (≈L220) | ... |`
// Pick whichever shape matches the most rows (ties favor numbered).
const numbered = section.split("\n").filter(l => /^\|\s*\*?\*?\d+\*?\*?\s*\|/.test(l));
const pathRows = section.split("\n").filter(l => /^\|\s*[a-z_/\.][a-z_/\.0-9]*\.(rs|ts|py)\b/i.test(l));
const rows = numbered.length >= pathRows.length ? numbered : pathRows;
return rows.map(row => {
// Candidate symbols: anything in backticks, plus bare lowercase
// identifier-looking words of length ≥5 (deliberately over-inclusive;
// grounding only needs ONE symbol to hit the file).
const sym = new Set<string>();
for (const t of row.matchAll(/`([A-Za-z_][A-Za-z0-9_:]*)`/g)) sym.add(t[1]);
for (const t of row.matchAll(/\b([a-z][a-z0-9_]{4,})\b/g)) sym.add(t[1]);
// Line numbers: 2-5 digit runs preceded by ':' or '-' (e.g. `:106`).
const lines: number[] = [];
for (const t of row.matchAll(/[:\-](\d{2,5})/g)) lines.push(parseInt(t[1]));
return { symbols: [...sym], lines };
});
}
|
||||
|
||||
/**
 * Score a review `md` against the focus `file` on disk.
 *
 * A finding is "grounded" when at least one of its symbols appears
 * verbatim in the file AND none of its cited line numbers exceed the
 * file's line count; any finding citing a past-EOF line counts toward
 * `oob`. Reads the file fresh on every call (simple, and rows for the
 * same file hit the OS page cache anyway).
 */
function grounded(md: string, file: string): { total: number; grounded: number; oob: number } {
const content = readFileSync(file, "utf8");
const eof = content.split("\n").length;
const findings = extractFindings(md);
let g = 0, oob = 0;
for (const f of findings) {
const symHit = f.symbols.length > 0 && f.symbols.some(s => content.includes(s));
const lineOob = f.lines.length > 0 && f.lines.some(l => l > eof);
if (lineOob) oob++;
if (symHit && !lineOob) g++;
}
return { total: findings.length, grounded: g, oob };
}
|
||||
|
||||
// Load the experiment log, keeping only parseable rows inside the window.
// Malformed lines (partial writes, etc.) are skipped silently.
const lines = readFileSync(JSONL, "utf8").split("\n").filter(Boolean);
const rows: Row[] = [];
for (const l of lines) {
try {
const r: Row = JSON.parse(l);
// Lexicographic ISO-timestamp comparison; argSince may be a prefix.
if (r.ts < argSince) continue;
rows.push(r);
} catch {}
}

if (rows.length === 0) { console.error(`no rows since ${argSince}`); process.exit(1); }

// Group: condition → file → array of grounded counts
type CellArr = { grnd: number[]; total: number[]; oob: number[]; ms: number[] };
const byCond: Record<string, Record<string, CellArr>> = {};
for (const r of rows) {
const k = condKey(r);
byCond[k] ??= {};
byCond[k][r.file_path] ??= { grnd: [], total: [], oob: [], ms: [] };
// Grounding is recomputed per row against the file on disk.
const g = grounded(r.response, r.file_path);
byCond[k][r.file_path].grnd.push(g.grounded);
byCond[k][r.file_path].total.push(g.total);
byCond[k][r.file_path].oob.push(g.oob);
byCond[k][r.file_path].ms.push(r.latency_ms);
}
|
||||
|
||||
function stats(xs: number[]): { n: number; mean: number; sd: number; min: number; max: number } {
|
||||
const n = xs.length;
|
||||
if (n === 0) return { n: 0, mean: 0, sd: 0, min: 0, max: 0 };
|
||||
const mean = xs.reduce((s, x) => s + x, 0) / n;
|
||||
const variance = n === 1 ? 0 : xs.reduce((s, x) => s + (x - mean) ** 2, 0) / (n - 1);
|
||||
return { n, mean, sd: Math.sqrt(variance), min: Math.min(...xs), max: Math.max(...xs) };
|
||||
}
|
||||
|
||||
// Stable ordering for both report tables.
const conditions = Object.keys(byCond).sort();
const files = [...new Set(rows.map(r => r.file_path))].sort();

console.log(`\n═══ Pass 5 variance — since ${argSince} ═══\n`);
console.log(` ${rows.length} rows · ${conditions.length} conditions · ${files.length} files\n`);

// Table 1: per-file, per-condition grounded-count mean ± sd, range,
// out-of-bounds citations, and average latency.
for (const file of files) {
console.log(`📄 ${file}`);
console.log(` ${"condition".padEnd(56)} n ${"grounded mean ± sd".padStart(20)} ${"range".padStart(8)} ${"oob".padStart(4)} ${"avg ms".padStart(7)}`);
console.log(` ${"─".repeat(56)} ─── ${"─".repeat(20)} ${"─".repeat(8)} ${"─".repeat(4)} ${"─".repeat(7)}`);
for (const c of conditions) {
const cell = byCond[c]?.[file];
if (!cell || cell.grnd.length === 0) continue;
const s = stats(cell.grnd);
const oobSum = cell.oob.reduce((a, b) => a + b, 0);
const msMean = cell.ms.reduce((a, b) => a + b, 0) / cell.ms.length;
const meanSd = `${s.mean.toFixed(1)} ± ${s.sd.toFixed(1)}`;
const range = `[${s.min}-${s.max}]`;
console.log(` ${c.padEnd(56)} ${String(s.n).padStart(3)} ${meanSd.padStart(20)} ${range.padStart(8)} ${String(oobSum).padStart(4)} ${Math.round(msMean / 1000).toString().padStart(5)}s`);
}
console.log("");
}

// Head-to-head: for each condition vs isolation baseline, count rep-by-rep
// wins across the same file. Requires equal rep counts.
console.log(`═══ Head-to-head: each condition vs isolation, rep-by-rep ═══\n`);
const isoKey = conditions.find(c => c.startsWith("codereview_isolation"));
if (!isoKey) {
console.log(" no isolation rows in window");
} else {
console.log(` baseline: ${isoKey}\n`);
console.log(` ${"challenger".padEnd(56)} wins losses ties Δ mean grnd`);
console.log(` ${"─".repeat(56)} ${"─".repeat(4)} ${"─".repeat(6)} ${"─".repeat(4)} ${"─".repeat(12)}`);
for (const c of conditions) {
if (c === isoKey) continue;
let wins = 0, losses = 0, ties = 0, deltaSum = 0, n = 0;
for (const file of files) {
const isoArr = byCond[isoKey]?.[file]?.grnd ?? [];
const cArr = byCond[c]?.[file]?.grnd ?? [];
// Reps are paired by position; extra reps on either side are ignored.
const k = Math.min(isoArr.length, cArr.length);
for (let i = 0; i < k; i++) {
if (cArr[i] > isoArr[i]) wins++;
else if (cArr[i] < isoArr[i]) losses++;
else ties++;
deltaSum += cArr[i] - isoArr[i];
n++;
}
}
const dMean = n > 0 ? (deltaSum / n).toFixed(2) : "—";
console.log(` ${c.padEnd(56)} ${String(wins).padStart(4)} ${String(losses).padStart(6)} ${String(ties).padStart(4)} ${dMean.padStart(12)}`);
}
}
|
||||
@ -1,105 +0,0 @@
|
||||
#!/usr/bin/env bun
/**
 * Pass 5: variance test for the 2026-04-26 paid-model bake-off.
 *
 * The pass-4 single-rep sweep showed isolation beating every matrix
 * condition by 1.0-1.4 grounded findings/file on grok-4.1-fast. This
 * harness runs N reps × M conditions on the file where the effect was
 * sharpest (pathway_memory.rs, 1355 lines) so we can decide whether
 * the deltas are real signal or run-to-run noise.
 *
 * Conditions:
 *   1. codereview_isolation — no matrix
 *   2. codereview_lakehouse + corpus=lakehouse_arch_v1 — A only
 *   3. codereview_lakehouse + corpus=lakehouse_symbols_v1 — C only
 *   4. codereview_lakehouse (modes.toml default) — A+C composed
 *
 * Output appends per-call to data/_kb/mode_experiments.jsonl. Aggregate
 * with `bun run scripts/mode_compare.ts --since <ts>` and read the
 * grounded column with multiple rows per (mode|corpus) key.
 *
 * Usage:
 *   bun run scripts/mode_pass5_variance_paid.ts
 *   LH_REPS=3 LH_FILE=crates/queryd/src/delta.rs bun run scripts/mode_pass5_variance_paid.ts
 */

// Gateway, paid model, focus file and rep count — all env-overridable.
const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
const MODEL = process.env.LH_MODEL ?? "x-ai/grok-4.1-fast";
const FILE = process.env.LH_FILE ?? "crates/vectord/src/pathway_memory.rs";
const REPS = Number(process.env.LH_REPS ?? 5);

// One experimental condition; when `corpus` is omitted the gateway uses
// the modes.toml default corpus for the mode.
interface Condition {
label: string;          // padded for aligned progress output
mode: string;
corpus?: string | string[];
}

const ALL_CONDITIONS: Condition[] = [
{ label: "isolation ", mode: "codereview_isolation" },
{ label: "arch_only ", mode: "codereview_lakehouse", corpus: "lakehouse_arch_v1" },
{ label: "symbols_only ", mode: "codereview_lakehouse", corpus: "lakehouse_symbols_v1" },
{ label: "composed (A+C) ", mode: "codereview_lakehouse" /* uses modes.toml default */ },
];

// Optional whitelist via env: LH_CONDITIONS=isolation,composed limits the
// run to a subset (matches against the trimmed `label`). Useful when only
// the head-to-head pair matters and saves ~50% latency on slow rungs.
const wantedLabels = (process.env.LH_CONDITIONS ?? "")
.split(",").map(s => s.trim().toLowerCase()).filter(Boolean);
const CONDITIONS: Condition[] = wantedLabels.length === 0
? ALL_CONDITIONS
: ALL_CONDITIONS.filter(c => wantedLabels.some(w => c.label.trim().toLowerCase().startsWith(w)));
||||
|
||||
/**
 * Execute one condition call against the gateway. Never throws: transport
 * errors and non-2xx responses come back as { ok: false, error }.
 *
 * NOTE(review): `rep` is accepted for call-site symmetry with the other
 * pass harnesses but is not used in the request — repetition bookkeeping
 * lives in the gateway-side JSONL timestamps; kept to preserve the
 * call signature.
 */
async function runOne(c: Condition, rep: number): Promise<{ ok: boolean; latency_ms?: number; resp_chars?: number; error?: string }> {
const body: any = {
task_class: "scrum_review",
file_path: FILE,
force_mode: c.mode,
force_model: MODEL,
};
// Only force a corpus when the condition names one; otherwise the
// gateway falls back to the modes.toml default for the mode.
if (c.corpus !== undefined) body.force_matrix_corpus = c.corpus;

try {
// 4-minute hard timeout — the paid model on a 1355-line file is slow.
const r = await fetch(`${GATEWAY}/v1/mode/execute`, {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify(body),
signal: AbortSignal.timeout(240_000),
});
if (!r.ok) {
const txt = await r.text().catch(() => "");
return { ok: false, error: `HTTP ${r.status}: ${txt.slice(0, 160)}` };
}
const j: any = await r.json();
return { ok: true, latency_ms: j.latency_ms, resp_chars: (j.response ?? "").length };
} catch (e: any) {
return { ok: false, error: e.message };
}
}
|
||||
|
||||
/**
 * Interleave conditions within each rep (rep-major order) so slow drift
 * in gateway/model behavior spreads evenly across conditions, then print
 * the --since timestamp to feed straight into mode_compare.ts.
 */
async function main() {
const total = CONDITIONS.length * REPS;
console.log(`[pass5] file=${FILE}`);
console.log(`[pass5] model=${MODEL} · ${CONDITIONS.length} conditions × ${REPS} reps = ${total} runs`);
console.log("");

let i = 0;
// Captured BEFORE the first call so the aggregation window covers all rows.
const startTs = new Date().toISOString();
for (let rep = 1; rep <= REPS; rep++) {
for (const c of CONDITIONS) {
i++;
// Progress line stays open; the ✓/✗ outcome completes it below.
process.stdout.write(` [${i}/${total}] rep=${rep} ${c.label}... `);
const r = await runOne(c, rep);
if (r.ok) {
console.log(`✓ ${r.resp_chars} chars · ${((r.latency_ms ?? 0) / 1000).toFixed(1)}s`);
} else {
console.log(`✗ ${r.error}`);
}
}
}

console.log(`\n[pass5] complete · started ${startTs}`);
console.log(`[pass5] aggregate: bun run scripts/mode_compare.ts --since ${startTs}`);
}

main().catch(e => { console.error(e); process.exit(1); });
|
||||
@ -1,458 +0,0 @@
|
||||
#!/bin/bash
# OVERNIGHT PROOF — the test that settles it
# Runs unattended: embed 500K, build indexes, measure recall,
# autonomous agent test, sustained load. ~3 hours total.
#
# Monitor: tail -f /home/profit/lakehouse/logs/overnight_proof.log
#
# Structure: a resumable state machine. Each invocation performs ONE step,
# writes the next step name into $STATE, and exits; a heartbeat/cron re-runs
# the script until $STATE reads "done". Step success is detected log-side:
# each step prints a "STEP N COMPLETE" marker that the shell greps for.

# -e is deliberately omitted: a failing sub-command must not kill the
# state machine; progress is judged via the log markers instead.
set -uo pipefail

LOG="/home/profit/lakehouse/logs/overnight_proof.log"   # append-only progress log
STATE="/tmp/overnight_proof_state"                      # name of the next step to run
LOCK="/tmp/overnight_proof.lock"                        # PID file: single-instance guard
LH="http://localhost:3100"                              # lakehouse API
GW="http://localhost:3700"                              # gateway API

mkdir -p /home/profit/lakehouse/logs

# Single-instance guard: exit quietly if the recorded PID is still alive.
if [ -f "$LOCK" ] && kill -0 "$(cat $LOCK)" 2>/dev/null; then
    echo "$(date) Already running" >> "$LOG"
    exit 0
fi
echo $$ > "$LOCK"
trap "rm -f $LOCK" EXIT

# log MSG — timestamped line to both stdout and $LOG.
log() { echo "$(date '+%H:%M:%S') $1" | tee -a "$LOG"; }

touch "$STATE"
step=$(cat "$STATE" 2>/dev/null || echo "embed")
# FIX: `touch` above creates an EMPTY file, so `cat` succeeds with empty
# output and the `|| echo embed` fallback never fires — `step` would be ""
# on first run, match no case arm, and the machine would stall forever.
# Treat an empty state as the first step.
step=${step:-embed}

log "═══ OVERNIGHT PROOF: step=$step ═══"

case "$step" in

  embed)
    log "STEP 1/5: Embedding 500K workers through Ollama (~40 min)"
    log " This is the real test — actual nomic-embed-text embeddings, not random vectors"

    python3 << 'PYEOF' >> "$LOG" 2>&1
import json, time, sys
from urllib.request import Request, urlopen
from urllib.error import HTTPError

LH = "http://localhost:3100"

# post(path, body, timeout) -> parsed JSON dict, or {"error": ...} on failure.
def post(path, body, timeout=300):
    r = Request(f"{LH}{path}", json.dumps(body).encode(), headers={"Content-Type": "application/json"})
    try: return json.loads(urlopen(r, timeout=timeout).read())
    except HTTPError as e: return {"error": e.read().decode()[:200]}
    except Exception as e: return {"error": str(e)}

# Fetch 500K resume_text for embedding
print("Fetching resume texts from workers_500k...")
r = post("/query/sql", {"sql": "SELECT worker_id, resume_text FROM workers_500k LIMIT 500000"})
if "error" in r:
    print(f"SQL error: {r['error']}")
    sys.exit(1)

rows = r.get("rows", [])
print(f"Got {len(rows)} rows")

# Build docs for embedding (skip empty / trivially short resumes)
docs = []
for row in rows:
    wid = row.get("worker_id", "")
    text = row.get("resume_text", "")
    if text and len(text) > 20:
        docs.append({"id": f"W500K-{wid}", "text": text})

print(f"{len(docs)} docs ready for embedding")

# Chunk into batches of 50K to avoid timeout issues
BATCH = 50000
for batch_start in range(0, len(docs), BATCH):
    batch = docs[batch_start:batch_start + BATCH]
    batch_num = batch_start // BATCH + 1
    total_batches = (len(docs) + BATCH - 1) // BATCH
    idx_name = f"workers_500k_v{batch_num}"

    print(f"\nBatch {batch_num}/{total_batches}: {len(batch)} docs → index '{idx_name}'")
    t0 = time.time()

    r = post("/vectors/index", {
        "index_name": idx_name,
        "source": "workers_500k",
        "documents": batch,
        "chunk_size": 500,
        "overlap": 50,
    }, timeout=600)

    if "error" in r:
        print(f" Index creation error: {r['error']}")
        continue

    job_id = r.get("job_id")
    chunks = r.get("chunks", 0)
    print(f" Job {job_id}: {chunks} chunks, embedding in background...")

    # Wait for this batch to complete (poll every 5s)
    for _ in range(600):  # 50 min max per batch
        time.sleep(5)
        status = post(f"/vectors/jobs/{job_id}", None) if job_id else {"status": "unknown"}
        if isinstance(status, dict):
            state = status.get("status", "unknown")
            progress = status.get("embedded_chunks", 0)
            if state == "completed":
                elapsed = time.time() - t0
                rate = chunks / elapsed if elapsed > 0 else 0
                print(f" DONE: {chunks} chunks in {elapsed:.0f}s ({rate:.0f}/sec)")
                break
            elif state == "failed":
                print(f" FAILED: {status.get('error', 'unknown')}")
                break
            sys.stdout.write(f"\r {state}: {progress}/{chunks} chunks...")
            sys.stdout.flush()
    print()

print("\nAll batches submitted. Checking indexes...")
r = post("/vectors/indexes", None)
if not isinstance(r, list): r = []
for idx in r:
    if "500k" in idx.get("index_name", ""):
        print(f" {idx['index_name']}: {idx['chunk_count']} chunks")

print("STEP 1 COMPLETE")
PYEOF

    if grep -q "STEP 1 COMPLETE" "$LOG"; then
        echo "build_indexes" > "$STATE"
        log "Embedding complete — moving to index build"
    else
        log "Embedding may still be running — will check on next heartbeat"
        echo "check_embed" > "$STATE"
    fi
    ;;

  check_embed)
    log "Checking embedding job status..."
    python3 -c "
import json
from urllib.request import urlopen
r = json.loads(urlopen('http://localhost:3100/vectors/jobs', timeout=30).read())
running = [j for j in r if j.get('status') == 'running']
completed = [j for j in r if j.get('status') == 'completed' and '500k' in j.get('index_name','')]
print(f'Running: {len(running)}, Completed 500K: {len(completed)}')
if not running:
    print('ALL_DONE')
" >> "$LOG" 2>&1

    # NOTE(review): grep scans the whole accumulated log, so a marker from an
    # earlier heartbeat also satisfies it — acceptable for this one-way machine.
    if grep -q "ALL_DONE" "$LOG"; then
        echo "build_indexes" > "$STATE"
    fi
    ;;

  build_indexes)
    log "STEP 2/5: Building HNSW + Lance on real 500K embeddings"

    python3 << 'PYEOF' >> "$LOG" 2>&1
import json, time
from urllib.request import Request, urlopen

LH = "http://localhost:3100"
# No error folding here: an HTTP failure raises and aborts the heredoc,
# which leaves the STEP 2 marker unwritten and keeps the state unchanged.
def post(path, body, timeout=600):
    r = Request(f"{LH}{path}", json.dumps(body).encode(), headers={"Content-Type": "application/json"})
    return json.loads(urlopen(r, timeout=timeout).read())

# Find the first 500K index
indexes = json.loads(urlopen(f"{LH}/vectors/indexes", timeout=30).read())
idx_500k = [i for i in indexes if "500k" in i.get("index_name","")]
if not idx_500k:
    print("No 500K index found — embedding may not be complete")
    exit(1)

idx_name = idx_500k[0]["index_name"]
chunks = idx_500k[0]["chunk_count"]
print(f"Using index: {idx_name} ({chunks} chunks)")

# Build HNSW
print(f"Building HNSW on {chunks} real embeddings...")
t0 = time.time()
r = post("/vectors/hnsw/build", {"index_name": idx_name})
print(f" HNSW: {r.get('vectors',0)} vectors in {time.time()-t0:.0f}s")

# Migrate to Lance
print(f"Migrating to Lance...")
t0 = time.time()
r = post(f"/vectors/lance/migrate/{idx_name}", {})
stats = r.get("stats", {})
print(f" Lance: {stats.get('rows_written',0)} rows in {stats.get('duration_secs',0):.1f}s")

# Build IVF_PQ on Lance
# sqrt(50K) ≈ 224 partitions for a 50K batch
print(f"Building IVF_PQ on Lance...")
t0 = time.time()
r = post(f"/vectors/lance/index/{idx_name}", {"num_partitions": 224, "num_bits": 8, "num_sub_vectors": 192})
print(f" IVF_PQ: built in {r.get('build_time_secs',0):.0f}s")

# Build scalar btree
print(f"Building scalar btree on doc_id...")
r = post(f"/vectors/lance/scalar-index/{idx_name}/doc_id", {})
print(f" Btree: built in {r.get('build_time_secs',0):.1f}s")

print("STEP 2 COMPLETE")
PYEOF

    if grep -q "STEP 2 COMPLETE" "$LOG"; then
        echo "recall_test" > "$STATE"
    fi
    ;;

  recall_test)
    log "STEP 3/5: Measuring recall on REAL embeddings"

    python3 << 'PYEOF' >> "$LOG" 2>&1
import json, time
from urllib.request import Request, urlopen

LH = "http://localhost:3100"
def post(path, body, timeout=300):
    r = Request(f"{LH}{path}", json.dumps(body).encode(), headers={"Content-Type": "application/json"})
    return json.loads(urlopen(r, timeout=timeout).read())

# Find 500K index
indexes = json.loads(urlopen(f"{LH}/vectors/indexes", timeout=30).read())
idx_500k = [i for i in indexes if "500k" in i.get("index_name","")]
if not idx_500k:
    print("No 500K index — skipping recall")
    exit(0)
idx_name = idx_500k[0]["index_name"]

# Auto-generate eval harness
print(f"Generating eval harness for {idx_name}...")
r = post(f"/vectors/hnsw/evals/{idx_name}_recall/autogen", {
    "index_name": idx_name, "sample_count": 50, "k": 10,
})
print(f" Harness: {len(r.get('queries',[]))} queries, k={r.get('k',10)}")

# HNSW recall
print("Measuring HNSW recall...")
r = post("/vectors/hnsw/trial", {
    "index_name": idx_name,
    "harness": f"{idx_name}_recall",
    "config": {"ef_construction": 80, "ef_search": 30, "seed": 42},
})
print(f" HNSW recall@10: {r.get('metrics',{}).get('recall_at_k',0):.4f}")
print(f" HNSW p50: {r.get('metrics',{}).get('search_latency_p50_us',0):.0f}us")

# Lance recall
print("Measuring Lance IVF_PQ recall...")
r = post(f"/vectors/lance/recall/{idx_name}", {
    "harness": f"{idx_name}_recall", "top_k": 10,
})
print(f" Lance recall@10: {r.get('mean_recall',0):.4f}")
print(f" Lance p50: {r.get('latency_p50_us',0):.0f}us")

print("STEP 3 COMPLETE")
PYEOF

    if grep -q "STEP 3 COMPLETE" "$LOG"; then
        echo "autonomous_test" > "$STATE"
    fi
    ;;

  autonomous_test)
    log "STEP 4/5: 100 staffing questions — LOCAL MODEL ONLY, no human steering"

    python3 << 'PYEOF' >> "$LOG" 2>&1
import json, time, random
from urllib.request import Request, urlopen
from urllib.error import HTTPError

GW = "http://localhost:3700"
LH = "http://localhost:3100"
random.seed(2026)  # fixed seed → reproducible operation mix across runs

# gw(path, body) — POST when body is given, else GET; errors folded into dict.
def gw(path, body=None, timeout=180):
    data = json.dumps(body).encode() if body else None
    method = "POST" if body else "GET"
    r = Request(f"{GW}{path}", data=data, method=method, headers={"Content-Type":"application/json"} if body else {})
    try: return json.loads(urlopen(r, timeout=timeout).read())
    except HTTPError as e: return {"error": e.read().decode()[:200]}
    except Exception as e: return {"error": str(e)}

def sql(query):
    r = gw("/sql", {"sql": query})
    return r.get("rows", []) if "error" not in r else []

ROLES = ["Forklift Operator","Machine Operator","Assembler","Loader","Quality Tech",
         "Welder","Sanitation Worker","Shipping Clerk","Production Worker","Maintenance Tech"]
STATES = ["IL","IN","OH","MO","TN","KY","WI","MI"]

print("═══ 100 AUTONOMOUS OPERATIONS ═══")
passed = 0
failed = 0
total_ms = 0

# Mix of operation types
for i in range(100):
    op_type = random.choices(["match","count","aggregate","lookup"], weights=[50,25,15,10])[0]
    role = random.choice(ROLES)
    state = random.choice(STATES)
    rel = round(random.uniform(0.6, 0.9), 2)

    t0 = time.time()
    ok = False
    detail = ""

    if op_type == "match":
        r = gw("/search", {
            "question": f"Find {role} workers in {state}",
            "sql_filter": f"role = '{role}' AND state = '{state}' AND CAST(reliability AS DOUBLE) >= {rel}",
            "dataset": "workers_500k", "top_k": 5, "generate": False,
        })
        matched = len(r.get("sources", []))
        ok = matched > 0 or r.get("sql_matches", 0) == 0  # 0 matches is ok if SQL found 0
        detail = f"match: {matched} results (sql={r.get('sql_matches',0)})"

    elif op_type == "count":
        truth = sql(f"SELECT COUNT(*) cnt FROM workers_500k WHERE role = '{role}' AND state = '{state}'")
        expected = truth[0]["cnt"] if truth else 0
        # Use keyword classifier logic: count → SQL
        r = gw("/sql", {"sql": f"SELECT COUNT(*) cnt FROM workers_500k WHERE role = '{role}' AND state = '{state}'"})
        got = r.get("rows", [{}])[0].get("cnt", -1) if "error" not in r else -1
        ok = got == expected
        detail = f"count: got={got} expected={expected}"

    elif op_type == "aggregate":
        r = gw("/sql", {"sql": f"SELECT ROUND(AVG(CAST(reliability AS DOUBLE)),3) avg FROM workers_500k WHERE role = '{role}' AND state = '{state}'"})
        ok = "error" not in r and r.get("rows")
        detail = f"aggregate: {r.get('rows',[{}])[0] if ok else r.get('error','?')[:40]}"

    elif op_type == "lookup":
        wid = random.randint(1, 500000)
        r = gw(f"/worker/{wid}")
        ok = r.get("rows") and len(r["rows"]) > 0
        detail = f"lookup: worker {wid} {'found' if ok else 'not found'}"

    ms = (time.time()-t0)*1000
    total_ms += ms
    if ok: passed += 1
    else: failed += 1

    # Print every 20th operation plus every failure.
    if i % 20 == 0 or not ok:
        icon = "OK" if ok else "FAIL"
        print(f" [{i+1:3d}/100] {icon} {op_type:10s} {detail[:50]:50s} ({ms:.0f}ms)")

pct = passed / 100 * 100
print(f"\n═══ RESULT: {passed}/100 passed ({pct:.0f}%) in {total_ms/1000:.1f}s ═══")
print(f" avg latency: {total_ms/100:.0f}ms per operation")

# Log to playbook
gw("/log", {
    "operation": f"autonomous_100: {passed}/100 ({pct:.0f}%)",
    "approach": "keyword routing + SQL + hybrid, local model only",
    "result": f"passed={passed} failed={failed} avg_ms={total_ms/100:.0f}",
    "context": "overnight proof step 4",
})

if pct >= 90:
    print("STEP 4 COMPLETE — AUTONOMOUS TEST PASSED")
else:
    print(f"STEP 4 COMPLETE — {pct:.0f}% (below 90% target)")
PYEOF

    # Step 4 advances unconditionally — the pass rate is informational.
    echo "sustained_load" > "$STATE"
    ;;

  sustained_load)
    log "STEP 5/5: Sustained load — 30 minutes of continuous operations"

    python3 << 'PYEOF' >> "$LOG" 2>&1
import json, time, random, concurrent.futures
from urllib.request import Request, urlopen
from urllib.error import HTTPError

GW = "http://localhost:3700"
random.seed(42)

def gw(path, body=None):
    data = json.dumps(body).encode() if body else None
    r = Request(f"{GW}{path}", data=data, method="POST" if body else "GET",
                headers={"Content-Type":"application/json"} if body else {})
    try: return json.loads(urlopen(r, timeout=60).read())
    # FIX: was a bare `except:`, which also swallows KeyboardInterrupt and
    # SystemExit — during a 30-minute loop that makes Ctrl-C ineffective.
    except Exception: return {"error": "timeout"}

ROLES = ["Forklift Operator","Machine Operator","Assembler","Loader","Quality Tech"]
STATES = ["IL","IN","OH","MO"]

print("═══ SUSTAINED LOAD: 30 minutes ═══")
duration = 30 * 60  # 30 minutes
t_start = time.time()
ops = 0
errors = 0
cycle = 0

while time.time() - t_start < duration:
    cycle += 1
    batch_ops = 0
    batch_errors = 0

    # Fire 10 concurrent operations
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as pool:
        futures = []
        for _ in range(10):
            role = random.choice(ROLES)
            state = random.choice(STATES)
            futures.append(pool.submit(gw, "/sql", {
                "sql": f"SELECT COUNT(*) FROM workers_500k WHERE role = '{role}' AND state = '{state}'"
            }))
        for f in concurrent.futures.as_completed(futures):
            r = f.result()
            batch_ops += 1
            if "error" in r: batch_errors += 1

    ops += batch_ops
    errors += batch_errors
    elapsed = time.time() - t_start
    remaining = duration - elapsed

    if cycle % 30 == 0:  # Log every ~30 cycles
        rate = ops / elapsed
        print(f" {elapsed/60:.0f}min: {ops} ops ({rate:.0f}/sec) errors={errors} remaining={remaining/60:.0f}min")

    time.sleep(1)  # 1 sec between batches

elapsed = time.time() - t_start
rate = ops / elapsed
print(f"\n═══ SUSTAINED LOAD COMPLETE ═══")
print(f" Duration: {elapsed/60:.1f} minutes")
print(f" Operations: {ops}")
print(f" Rate: {rate:.0f} ops/sec")
print(f" Errors: {errors} ({100*errors/max(ops,1):.1f}%)")
print(f" STEP 5 COMPLETE")
PYEOF

    echo "report" > "$STATE"
    ;;

  report)
    log ""
    log "═══════════════════════════════════════════════════════"
    log " OVERNIGHT PROOF — COMPLETE"
    log "═══════════════════════════════════════════════════════"
    log " Step 1: 500K real embeddings via Ollama"
    log " Step 2: HNSW + Lance indexes on real data"
    log " Step 3: Recall measured on real embeddings"
    log " Step 4: 100 autonomous operations (no human)"
    log " Step 5: 30 min sustained concurrent load"
    log ""
    log " Full log: $LOG"
    log "═══════════════════════════════════════════════════════"
    echo "done" > "$STATE"
    ;;

  done)
    log "Overnight proof already complete."
    ;;

esac
|
||||
Loading…
x
Reference in New Issue
Block a user