Three new pieces, executed in order:
scripts/dump_raw_corpus.sh
- One-shot bash script that creates the MinIO bucket `raw` and uploads all
testing corpora as a persistent immutable test set. 365 MB total
across 5 prefixes (chicago, entities, sec, staffing, llm_team)
+ MANIFEST.json. Sources: workers_500k.parquet (309 MB),
resumes.parquet, entities.jsonl, sec_company_tickers.json,
Chicago permits last 30d (2,853 records, 5.4 MB), 9 LLM Team
Postgres tables dumped via row_to_json.
scripts/vectorize_raw_corpus.ts
- Bun script that fetches each raw-bucket source via mc, runs a
source-specific extractor into {id, text} docs, posts to
/vectors/index, polls job to completion. Verified results:
chicago_permits_v1: 3,420 chunks
entity_brief_v1: 634 chunks
sec_tickers_v1: 10,341 chunks (after extractor fix for
wrapped {rows: {...}} JSON shape)
llm_team_runs_v1: in flight, 19K+ chunks
llm_team_response_cache_v1: queued
scripts/analyze_chicago_contracts.ts
- Real inference pipeline that picks N high-cost permits with
named contractors from the raw bucket, queries all 6 contract-
analysis corpora in parallel via /vectors/search, builds a
MATRIX CONTEXT preamble, calls Grok 4.1 fast for structured
staffing analysis, hand-reviews each via observer /review,
appends to data/_kb/contract_analyses.jsonl.
tests/real-world/scrum_master_pipeline.ts
- MATRIX_CORPORA_FOR_TASK extended with two new task classes:
contract_analysis (chicago + entity_brief + sec + llm_team_runs
+ llm_team_response_cache + distilled_procedural)
staffing_inference (workers_500k_v8 + entity_brief + chicago
+ llm_team_runs + distilled_procedural)
scrum_review unchanged.
This is the first time the matrix architecture operates on real
ingested data instead of code-review smoke tests.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
248 lines
11 KiB
TypeScript
248 lines
11 KiB
TypeScript
#!/usr/bin/env bun
|
|
// Vectorize each raw-bucket corpus into a queryable matrix index.
|
|
// Stages each source from the MinIO raw bucket to a local file (via `mc cp`), shapes it into
|
|
// {id, text} docs, POSTs to gateway /vectors/index, polls job to done.
|
|
//
|
|
// Targets one index per source with stable names so MATRIX_CORPORA_FOR_TASK
|
|
// can reference them. Idempotent: re-running rebuilds with a fresh _v2.
|
|
//
|
|
// Usage:
|
|
// bun run scripts/vectorize_raw_corpus.ts [source...]
|
|
// Default: runs all sources in order. Sources: chicago, entities, sec, llm_team_runs, llm_team_response
|
|
|
|
/** Base URL of the gateway; the LAKEHOUSE_URL environment variable overrides the default. */
const GATEWAY = process.env.LAKEHOUSE_URL ?? "http://localhost:3100";

/** Bucket segment used when building `mc` source paths. */
const RAW_BUCKET = "raw";

/** `mc` alias segment used when building source paths. */
const MC_ALIAS = "local";

/** Local directory that fetched objects are staged into. */
const STAGE_DIR = "/tmp/vectorize_raw";
|
|
|
|
/** One document submitted for indexing: an identifier plus its text body. */
interface Doc {
  id: string;
  text: string;
}
|
|
/** Describes one raw-bucket source and how it is turned into an index. */
interface SourceSpec {
  /** Name matched against the CLI arguments to select this source. */
  name: string;
  /** Sent as `index_name` in the /vectors/index request. */
  index_name: string;
  /** Object path under the raw bucket. */
  s3_key: string;
  /** Sent as the gateway's `source` field. */
  source_label: string;
  /** Forwarded as `chunk_size` in the request body when set. */
  chunk_size?: number;
  /** Forwarded as `overlap` in the request body when set. */
  overlap?: number;
  /** Converts the staged file's full text into documents. */
  extractor: (raw: string) => Doc[];
}
|
|
|
|
/**
 * Copy one object out of the raw bucket into the local staging directory.
 *
 * Shells out to `mc cp -q` and waits for the process to exit. The staged file
 * name is the key with every "/" replaced by "_", so all keys land in one
 * flat directory.
 *
 * @param key Object key relative to the bucket.
 * @returns Path of the staged local copy (not the file contents).
 * @throws Error when `mc` finishes with a non-zero exit code.
 */
async function fetchFromRaw(key: string): Promise<string> {
  const { mkdir } = await import("node:fs/promises");
  await mkdir(STAGE_DIR, { recursive: true });

  const flatName = key.split("/").join("_");
  const stagedPath = `${STAGE_DIR}/${flatName}`;
  const remotePath = `${MC_ALIAS}/${RAW_BUCKET}/${key}`;

  const child = Bun.spawn(["mc", "cp", "-q", remotePath, stagedPath]);
  await child.exited;
  if (child.exitCode !== 0) {
    throw new Error(`mc cp failed for ${key}`);
  }
  return stagedPath;
}
|
|
|
|
/**
 * Read a JSON Lines file and parse every non-blank line.
 *
 * @param path Local file path.
 * @returns The parsed values, in file order.
 * @throws SyntaxError if any non-blank line is not valid JSON.
 */
async function readJsonl(path: string): Promise<any[]> {
  const contents = await Bun.file(path).text();
  const rows: any[] = [];
  for (const line of contents.split("\n")) {
    // Whitespace-only lines are skipped rather than parsed.
    if (line.trim()) rows.push(JSON.parse(line));
  }
  return rows;
}
|
|
|
|
/**
 * Cap a string at `n` UTF-16 code units.
 *
 * @param s Text to shorten; `null`/`undefined` yield "".
 * @param n Maximum length in UTF-16 code units (default 4000).
 * @returns `s` unchanged when it already fits, otherwise its first `n` code
 *   units, minus a trailing half of a surrogate pair if the cut landed inside
 *   one (so the result may be one unit shorter than `n`).
 */
function truncate(s: string, n = 4000): string {
  if (s == null) return "";
  if (s.length <= n) return s;

  const cut = s.slice(0, n);
  // A high surrogate as the final unit means the cut split an astral
  // character (emoji etc.); drop the orphan so the text stays well-formed.
  const last = cut.charCodeAt(cut.length - 1);
  return last >= 0xd800 && last <= 0xdbff ? cut.slice(0, -1) : cut;
}
|
|
|
|
// ─── EXTRACTORS — one per source ───
|
|
// Each shapes raw rows into {id, text} for the gateway's chunker.
|
|
|
|
/**
 * Shape the Chicago permits dump into one {id, text} document per permit.
 *
 * @param raw JSON text whose top level is an array of permit records.
 * @returns One Doc per array element, in input order.
 * @throws SyntaxError if `raw` is not valid JSON.
 * @throws Error if the top-level value is not an array (this used to surface
 *   as an opaque "map is not a function" TypeError).
 */
function extractChicagoPermits(raw: string): Doc[] {
  const parsed: unknown = JSON.parse(raw);
  if (!Array.isArray(parsed)) {
    const got = parsed === null ? "null" : typeof parsed;
    throw new Error(`chicago permits: expected a top-level JSON array, got ${got}`);
  }

  return parsed.map((entry: any, i: number) => {
    // Tolerate null/primitive entries instead of crashing the whole batch.
    const p = entry !== null && typeof entry === "object" ? entry : {};

    // Resolve the permit number once so the heading and the id cannot disagree
    // (the id previously ignored permit_number).
    const permitNo = p.permit_ ?? p.permit_number;

    // Skip absent parts so a missing direction/suffix leaves no double spaces.
    const address = [p.street_number, p.street_direction, p.street_name, p.suffix]
      .filter((part) => part != null && part !== "")
      .join(" ");

    const text = [
      `Permit ${permitNo ?? `unknown_${i}`}`,
      `Type: ${p.permit_type ?? "?"} Status: ${p.permit_status ?? "?"}`,
      `Address: ${address}`.trim(),
      `Issued: ${p.issue_date ?? "?"} Applied: ${p.application_start_date ?? "?"}`,
      `Work: ${truncate(p.work_description ?? "", 800)}`,
      `Estimated cost: ${p.reported_cost ?? p.estimated_cost ?? "?"}`,
      `Contractors: ${p.contact_1 ?? ""} | ${p.contact_2 ?? ""}`,
      `Owner: ${p.contact_3_name ?? ""} (${p.contact_3_type ?? ""})`,
      // Label corrected: this line prints subtotal_paid, not a subtype.
      `Subtotal paid: ${p.subtotal_paid ?? ""} community area=${p.community_area ?? ""} ward=${p.ward ?? ""}`,
    ].join("\n");

    return { id: `permit_${permitNo ?? p.id ?? i}`, text };
  });
}
|
|
|
|
/**
 * Turn the entities JSONL dump into one document per non-blank line.
 *
 * Each line is parsed as JSON and rendered as labelled fields followed by a
 * truncated copy of the whole record. A line that fails to parse or render
 * is kept as a document containing its first 1000 characters.
 *
 * @param raw Full JSONL text.
 * @returns One Doc per non-blank line, in input order.
 */
function extractEntities(raw: string): Doc[] {
  const docs: Doc[] = [];
  const lines = raw.split("\n").filter((l) => l.trim());

  lines.forEach((line, i) => {
    try {
      const e = JSON.parse(line);
      const name = e.normalized_name ?? e.name ?? e.display_name ?? `entity_${i}`;

      const parts: string[] = [`Entity: ${name}`, `Display: ${e.display_name ?? name}`];
      if (e.ticker) parts.push(`Ticker: ${e.ticker}`);
      if (e.cik) parts.push(`CIK: ${e.cik}`);
      if (e.aliases) parts.push(`Aliases: ${e.aliases.join(", ")}`);
      if (e.last_seen) parts.push(`Last seen: ${e.last_seen}`);
      if (e.notes) parts.push(`Notes: ${truncate(JSON.stringify(e.notes), 600)}`);
      parts.push(`Raw: ${truncate(JSON.stringify(e), 1500)}`);

      docs.push({ id: `entity_${name}_${i}`, text: parts.join("\n") });
    } catch {
      // Any failure above falls back to the raw line so the record is not lost.
      docs.push({ id: `entity_${i}`, text: line.slice(0, 1000) });
    }
  });

  return docs;
}
|
|
|
|
/**
 * Turn the SEC tickers JSON into one document per row that has a ticker.
 *
 * Accepts either the wrapped shape `{"_fetched_at": ..., "rows": {"0": {...}}}`
 * or a bare container of rows. Values that are not objects, or that lack a
 * truthy `ticker`, are skipped.
 *
 * @param raw JSON text of the tickers file.
 * @returns One Doc per usable row; empty when the JSON holds no row container
 *   (e.g. top-level `null`), which previously threw a TypeError.
 * @throws SyntaxError if `raw` is not valid JSON.
 */
function extractSecTickers(raw: string): Doc[] {
  const parsed: unknown = JSON.parse(raw);
  if (parsed === null || typeof parsed !== "object") return [];

  // Rows live under .rows when the wrapper is present; otherwise the
  // top-level value itself is the row container.
  const container = parsed as Record<string, unknown>;
  const rows = container.rows ?? container;
  if (rows === null || typeof rows !== "object") return [];

  const docs: Doc[] = [];
  for (const row of Object.values(rows as Record<string, any>)) {
    if (!row || typeof row !== "object" || !row.ticker) continue;
    docs.push({
      // ticker is guaranteed truthy here, so no index fallback is needed.
      id: `sec_${row.ticker}`,
      text: `Ticker: ${row.ticker}\nCompany: ${row.title ?? "?"}\nCIK: ${row.cik_str ?? "?"}`,
    });
  }
  return docs;
}
|
|
|
|
/**
 * Turn the team-runs JSONL dump into one document per non-blank line.
 *
 * Each record is rendered as labelled, length-capped fields. Payload fields
 * (prompt, input, output, result) that are not strings are JSON-encoded
 * first; `prompt` used to be interpolated directly, so an object-valued
 * prompt was rendered as "[object Object]".
 *
 * A line that fails to parse or render is kept as a document containing its
 * first 2000 characters.
 *
 * @param raw Full JSONL text.
 * @returns One Doc per non-blank line, in input order.
 */
function extractLlmTeamRuns(raw: string): Doc[] {
  // Only called on truthy values, so JSON.stringify always yields a string.
  const asText = (v: unknown): string => (typeof v === "string" ? v : JSON.stringify(v));

  return raw.split("\n").filter((l) => l.trim()).map((line, i) => {
    try {
      const r = JSON.parse(line);
      const text = [
        `Team run ${r.id ?? i}`,
        `Mode: ${r.mode ?? "?"} Created: ${r.created_at ?? "?"}`,
        r.prompt ? `Prompt: ${truncate(asText(r.prompt), 1200)}` : "",
        r.input ? `Input: ${truncate(asText(r.input), 1200)}` : "",
        r.output ? `Output: ${truncate(asText(r.output), 2000)}` : "",
        r.result ? `Result: ${truncate(asText(r.result), 2000)}` : "",
        r.metadata ? `Meta: ${truncate(JSON.stringify(r.metadata), 600)}` : "",
      ].filter(Boolean).join("\n");
      return { id: `team_run_${r.id ?? i}`, text };
    } catch {
      // Keep unparseable rows as raw text rather than dropping them.
      return { id: `team_run_${i}`, text: line.slice(0, 2000) };
    }
  });
}
|
|
|
|
/**
 * Turn the response-cache JSONL dump into one document per non-blank line.
 *
 * `prompt` and `response` are JSON-encoded when they are not strings. They
 * used to be interpolated directly, so an object-valued field was rendered
 * as "[object Object]" and its content never reached the index.
 *
 * A line that fails to parse or render is kept as a document containing its
 * first 2000 characters.
 *
 * @param raw Full JSONL text.
 * @returns One Doc per non-blank line, in input order.
 */
function extractLlmTeamResponseCache(raw: string): Doc[] {
  // Only called on truthy values, so JSON.stringify always yields a string.
  const asText = (v: unknown): string => (typeof v === "string" ? v : JSON.stringify(v));

  return raw.split("\n").filter((l) => l.trim()).map((line, i) => {
    try {
      const r = JSON.parse(line);
      const key = r.cache_key ?? r.id ?? i;
      const text = [
        `Cached response ${key}`,
        `Created: ${r.created_at ?? "?"}`,
        r.prompt ? `Prompt: ${truncate(asText(r.prompt), 1500)}` : "",
        r.response ? `Response: ${truncate(asText(r.response), 2500)}` : "",
        r.model ? `Model: ${r.model}` : "",
      ].filter(Boolean).join("\n");
      return { id: `resp_${key}`, text };
    } catch {
      // Keep unparseable rows as raw text rather than dropping them.
      return { id: `resp_${i}`, text: line.slice(0, 2000) };
    }
  });
}
|
|
|
|
/** Every known source, in the order they run when no CLI names are given. */
const SOURCES: SourceSpec[] = [
  {
    name: "chicago",
    index_name: "chicago_permits_v1",
    s3_key: "chicago/permits_2026-04-25.json",
    source_label: "chicago_permits",
    chunk_size: 600,
    overlap: 80,
    extractor: extractChicagoPermits,
  },
  {
    name: "entities",
    index_name: "entity_brief_v1",
    s3_key: "entities/entities.jsonl",
    source_label: "entity_brief",
    chunk_size: 500,
    overlap: 60,
    extractor: extractEntities,
  },
  {
    name: "sec",
    index_name: "sec_tickers_v1",
    s3_key: "sec/company_tickers.json",
    source_label: "sec_tickers",
    chunk_size: 200,
    overlap: 20,
    extractor: extractSecTickers,
  },
  {
    name: "llm_team_runs",
    index_name: "llm_team_runs_v1",
    s3_key: "llm_team/team_runs.jsonl",
    source_label: "llm_team_runs",
    chunk_size: 800,
    overlap: 100,
    extractor: extractLlmTeamRuns,
  },
  {
    name: "llm_team_response",
    index_name: "llm_team_response_cache_v1",
    s3_key: "llm_team/response_cache.jsonl",
    source_label: "llm_team_response_cache",
    chunk_size: 800,
    overlap: 100,
    extractor: extractLlmTeamResponseCache,
  },
];
|
|
|
|
/**
 * Fetch one source, extract its documents, and submit them to the gateway's
 * /vectors/index endpoint.
 *
 * Fetch failures, an empty extraction and non-2xx responses are reported via
 * `ok: false` plus `err`. Errors raised while reading the staged file,
 * running the extractor, or performing the HTTP request propagate to the
 * caller.
 *
 * @param spec Source description.
 * @returns On success, `chunks` and `job_id` as reported in the gateway's
 *   response; on a handled failure, `chunks: 0` and an `err` message.
 */
async function vectorizeOne(spec: SourceSpec): Promise<{ ok: boolean; chunks: number; job_id?: string; err?: string }> {
  const startedAt = Date.now();
  console.log(`\n━━━ ${spec.name} → ${spec.index_name} ━━━`);
  console.log(`fetching s3://${RAW_BUCKET}/${spec.s3_key}`);

  let stagedPath: string;
  try {
    stagedPath = await fetchFromRaw(spec.s3_key);
  } catch (e: any) {
    return { ok: false, chunks: 0, err: `fetch: ${e.message}` };
  }

  console.log(`reading + extracting...`);
  const fileText = await Bun.file(stagedPath).text();
  const docs = spec.extractor(fileText);
  if (docs.length === 0) {
    return { ok: false, chunks: 0, err: "0 docs after extraction" };
  }
  let totalChars = 0;
  for (const d of docs) totalChars += d.text.length;
  console.log(` ${docs.length} docs (avg ${Math.round(totalChars / docs.length)} chars)`);

  console.log(`POST /vectors/index ${spec.index_name} ...`);
  const payload = JSON.stringify({
    index_name: spec.index_name,
    source: spec.source_label,
    documents: docs,
    chunk_size: spec.chunk_size,
    overlap: spec.overlap,
  });
  const resp = await fetch(`${GATEWAY}/vectors/index`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: payload,
    signal: AbortSignal.timeout(300000),
  });

  if (!resp.ok) {
    const errBody = await resp.text();
    // Only the first 300 characters of the error body are kept.
    return { ok: false, chunks: 0, err: `HTTP ${resp.status}: ${errBody.slice(0, 300)}` };
  }

  const submitted: any = await resp.json();
  const elapsedSec = ((Date.now() - startedAt) / 1000).toFixed(1);
  console.log(` ✓ submitted: job=${submitted.job_id} chunks=${submitted.chunks} (extract+submit ${elapsedSec}s)`);
  return { ok: true, chunks: submitted.chunks, job_id: submitted.job_id };
}
|
|
|
|
/**
 * Fetch the current state of an indexing job from the gateway.
 *
 * Never throws. A non-2xx response, a network error, the 5-second request
 * timeout, or an unparseable body all yield `status: "unknown"` with zero
 * counters. A single failed poll therefore cannot abort the wait for a job
 * that has already been submitted.
 *
 * @param jobId Job identifier from the /vectors/index response.
 * @returns The job's status and counters; fields missing from the response
 *   default to "?" / 0.
 */
async function pollJob(jobId: string): Promise<{ status: string; processed: number; total: number }> {
  try {
    const r = await fetch(`${GATEWAY}/vectors/jobs/${jobId}`, { signal: AbortSignal.timeout(5000) });
    if (!r.ok) return { status: "unknown", processed: 0, total: 0 };
    const j: any = await r.json();
    return { status: j?.status ?? "?", processed: j?.processed ?? 0, total: j?.total ?? 0 };
  } catch (e: unknown) {
    // Surface the problem without failing the caller's polling loop.
    const msg = e instanceof Error ? e.message : String(e);
    console.warn(` ⚠ poll failed for job ${jobId}: ${msg}`);
    return { status: "unknown", processed: 0, total: 0 };
  }
}
|
|
|
|
/**
 * Poll a job every 3 seconds until it finishes, fails, or `maxSec` elapses.
 *
 * Logs the outcome; between polls a progress line is printed at most once
 * every 15 seconds. The promise resolves in every case, so the outcome is
 * only visible in the log.
 *
 * @param jobId Job identifier to poll.
 * @param label Name used in log lines.
 * @param maxSec Upper bound on the wait, in seconds (default 600).
 */
async function waitForJob(jobId: string, label: string, maxSec = 600): Promise<void> {
  const finishedStates = new Set(["complete", "completed", "done"]);
  const failedStates = new Set(["failed", "error"]);
  const startedAt = Date.now();
  const elapsedSec = () => (Date.now() - startedAt) / 1000;
  const shortId = () => jobId.slice(0, 8);
  let lastProgressLog = 0;

  while (elapsedSec() < maxSec) {
    const { status, processed, total } = await pollJob(jobId);

    if (finishedStates.has(status)) {
      console.log(` ✓ ${label} job ${shortId()} complete (${processed}/${total} in ${elapsedSec().toFixed(0)}s)`);
      return;
    }
    if (failedStates.has(status)) {
      console.log(` ✗ ${label} job ${shortId()} failed at ${processed}/${total}`);
      return;
    }

    // Throttle progress output so long jobs do not flood the log.
    if (Date.now() - lastProgressLog > 15000) {
      console.log(` · ${label} progress ${processed}/${total} (${status})`);
      lastProgressLog = Date.now();
    }
    await new Promise((resolve) => setTimeout(resolve, 3000));
  }

  console.log(` ⚠ ${label} job ${shortId()} still running after ${maxSec}s — leaving in background`);
}
|
|
|
|
/**
 * CLI entry point: vectorize the sources named on the command line, or every
 * source when none are named, one after another, then print a summary.
 *
 * Unrecognized source names are reported instead of being silently dropped.
 * The process exit code is set to 1 when a requested name was unrecognized
 * or any source failed, so calling scripts can detect a partial run.
 *
 * @throws Error when arguments were given but none matches a known source.
 */
async function main() {
  const args = process.argv.slice(2);

  const knownNames = SOURCES.map((s) => s.name);
  const unrecognized = args.filter((a) => !knownNames.includes(a));
  if (unrecognized.length > 0) {
    console.error(`! unknown source(s): ${unrecognized.join(", ")} (valid: ${knownNames.join(", ")})`);
    process.exitCode = 1;
  }

  const targets = args.length > 0 ? SOURCES.filter((s) => args.includes(s.name)) : SOURCES;
  if (targets.length === 0) {
    throw new Error("no valid sources selected");
  }
  console.log(`Vectorizing ${targets.length} source(s): ${targets.map((t) => t.name).join(", ")}`);

  const results: Array<{ name: string; result: any }> = [];
  for (const spec of targets) {
    try {
      const r = await vectorizeOne(spec);
      if (r.ok && r.job_id) await waitForJob(r.job_id, spec.name);
      results.push({ name: spec.name, result: r });
    } catch (e: unknown) {
      // One failing source must not stop the remaining ones.
      const msg = e instanceof Error ? e.message : String(e);
      console.error(`! ${spec.name}: ${msg}`);
      results.push({ name: spec.name, result: { ok: false, err: msg } });
    }
  }

  console.log(`\n━━━ SUMMARY ━━━`);
  for (const { name, result } of results) {
    console.log(` ${result.ok ? "✓" : "✗"} ${name.padEnd(20)} chunks=${result.chunks ?? 0} ${result.err ? `err=${result.err}` : ""}`);
  }

  // Report failure through the exit code without cutting off pending output.
  if (results.some(({ result }) => !result.ok)) {
    process.exitCode = 1;
  }
}

main().catch((e: unknown) => {
  console.error(`FATAL: ${e instanceof Error ? e.message : String(e)}`);
  process.exit(1);
});
|