scripts: chicago analyzer field-name fixes + vectorize sanitizer hardening
Some checks failed
lakehouse/auditor 1 blocking issue: todo!() macro call in tests/real-world/scrum_master_pipeline.ts

Two small fixes surfaced during smoke testing:

analyze_chicago_contracts.ts: permit field is contact_1_name not
contact_1; reported_cost is integer-string. Fixed filter (was rejecting
all 2853 permits) and contractor extraction (was empty).

vectorize_raw_corpus.ts: sanitize() expanded to strip control chars +
ALL backslashes (kills incomplete \uXXXX escapes) + UTF-16 surrogates
(unpaired surrogates from emoji split by truncate boundary). Llm_team
response cache had docs with all three pollution shapes.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
root 2026-04-25 19:34:45 -05:00
parent 6ac7f61819
commit 779158a09b
2 changed files with 21 additions and 8 deletions

View File

@ -62,10 +62,11 @@ async function fetchPermits(n: number): Promise<Permit[]> {
const proc = Bun.spawn(["mc", "cp", "-q", `${MC_ALIAS}/${RAW_BUCKET}/chicago/permits_2026-04-25.json`, local]); const proc = Bun.spawn(["mc", "cp", "-q", `${MC_ALIAS}/${RAW_BUCKET}/chicago/permits_2026-04-25.json`, local]);
await proc.exited; await proc.exited;
const all: Permit[] = JSON.parse(await Bun.file(local).text()); const all: Permit[] = JSON.parse(await Bun.file(local).text());
// Pick high-cost permits with named contractors — most interesting for staffing analysis // Pick high-cost permits with named contractors — most interesting for staffing analysis.
// Field is `contact_1_name`, not `contact_1`. reported_cost is integer-like string.
const meaningful = all.filter(p => const meaningful = all.filter(p =>
p.reported_cost && Number(p.reported_cost) >= 100000 && p.reported_cost && Number(p.reported_cost) >= 100000 &&
(p.contact_1 || p.contact_2) (p.contact_1_name || p.contact_2_name)
); );
log(`raw permits: ${all.length} · meaningful (cost >= $100k + has contractor): ${meaningful.length}`); log(`raw permits: ${all.length} · meaningful (cost >= $100k + has contractor): ${meaningful.length}`);
// Sample evenly across the meaningful set // Sample evenly across the meaningful set
@ -79,8 +80,8 @@ async function fetchPermits(n: number): Promise<Permit[]> {
function permitToText(p: Permit): string { function permitToText(p: Permit): string {
const addr = `${p.street_number ?? ""} ${p.street_direction ?? ""} ${p.street_name ?? ""} ${p.suffix ?? ""}`.replace(/\s+/g, " ").trim(); const addr = `${p.street_number ?? ""} ${p.street_direction ?? ""} ${p.street_name ?? ""} ${p.suffix ?? ""}`.replace(/\s+/g, " ").trim();
const c1 = typeof p.contact_1 === "string" ? p.contact_1 : (p.contact_1?.name ?? p.contact_1?.contact_name ?? ""); const c1 = (p as any).contact_1_name ?? (typeof p.contact_1 === "string" ? p.contact_1 : (p.contact_1?.name ?? ""));
const c2 = typeof p.contact_2 === "string" ? p.contact_2 : (p.contact_2?.name ?? ""); const c2 = (p as any).contact_2_name ?? (typeof p.contact_2 === "string" ? p.contact_2 : (p.contact_2?.name ?? ""));
return [ return [
`Chicago Building Permit ${p.permit_ ?? "?"}`, `Chicago Building Permit ${p.permit_ ?? "?"}`,
`Type: ${p.permit_type ?? "?"} · Status: ${p.permit_status ?? "?"}`, `Type: ${p.permit_type ?? "?"} · Status: ${p.permit_status ?? "?"}`,
@ -187,7 +188,7 @@ async function analyzeOne(p: Permit, idx: number, total: number): Promise<any> {
const permitText = permitToText(p); const permitText = permitToText(p);
// Build matrix query: combine type + work description + contractor name for retrieval anchoring // Build matrix query: combine type + work description + contractor name for retrieval anchoring
const c1 = typeof p.contact_1 === "string" ? p.contact_1 : (p.contact_1?.name ?? ""); const c1 = (p as any).contact_1_name ?? (typeof p.contact_1 === "string" ? p.contact_1 : (p.contact_1?.name ?? ""));
const matrixQuery = `${p.permit_type ?? ""} ${(p.work_description ?? "").slice(0, 300)} ${c1}`; const matrixQuery = `${p.permit_type ?? ""} ${(p.work_description ?? "").slice(0, 300)} ${c1}`;
const matrix = await fetchMatrixHits(matrixQuery); const matrix = await fetchMatrixHits(matrixQuery);
const corporaSummary = Object.entries(matrix.by_corpus).map(([k, v]) => `${k.split("_v")[0]}=${v}`).join(" "); const corporaSummary = Object.entries(matrix.by_corpus).map(([k, v]) => `${k.split("_v")[0]}=${v}`).join(" ");

View File

@ -46,6 +46,18 @@ function truncate(s: string, n = 4000): string {
return s == null ? "" : (s.length > n ? s.slice(0, n) : s); return s == null ? "" : (s.length > n ? s.slice(0, n) : s);
} }
/**
 * Sanitize text before it is embedded in a JSON payload.
 *
 * Per the original note on this helper: the Llm_team response cache had rows
 * with truncated `\u` escapes that made the gateway's Rust serde JSON parser
 * reject (HTTP 400) the whole batch.
 *
 * Three passes are applied, in order:
 *  1. Remove ASCII control characters, keeping tab (0x09), LF (0x0A) and
 *     CR (0x0D).
 *  2. Remove EVERY backslash. This is deliberately blunt: it neutralizes any
 *     malformed or incomplete `\uXXXX` sequence in the source data, at the
 *     cost of also dropping legitimate backslashes.
 *  3. Remove UNPAIRED UTF-16 surrogates (e.g. half of an emoji cut at a
 *     truncate boundary). Well-formed surrogate pairs are preserved, so
 *     emoji and other astral characters survive.
 *
 * @param s - Raw text; any falsy value (empty string, null, undefined) yields "".
 * @returns The cleaned string.
 */
function sanitize(s: string): string {
  if (!s) return "";
  return s
    .replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g, "") // strip control chars (tab/LF/CR kept)
    .replace(/\\/g, "") // strip ALL backslashes — kills any malformed \uXXXX in source data
    // With the `u` flag the regex walks code points, so a valid high+low pair is
    // seen as one astral code point and does not match; only lone halves do.
    .replace(/[\uD800-\uDFFF]/gu, "");
}
// ─── EXTRACTORS — one per source ─── // ─── EXTRACTORS — one per source ───
// Each shapes raw rows into {id, text} for the gateway's chunker. // Each shapes raw rows into {id, text} for the gateway's chunker.
@ -129,13 +141,13 @@ function extractLlmTeamResponseCache(raw: string): Doc[] {
const text = [ const text = [
`Cached response ${r.cache_key ?? r.id ?? i}`, `Cached response ${r.cache_key ?? r.id ?? i}`,
`Created: ${r.created_at ?? "?"}`, `Created: ${r.created_at ?? "?"}`,
r.prompt ? `Prompt: ${truncate(r.prompt, 1500)}` : "", r.prompt ? `Prompt: ${sanitize(truncate(r.prompt, 1500))}` : "",
r.response ? `Response: ${truncate(r.response, 2500)}` : "", r.response ? `Response: ${sanitize(truncate(r.response, 2500))}` : "",
r.model ? `Model: ${r.model}` : "", r.model ? `Model: ${r.model}` : "",
].filter(Boolean).join("\n"); ].filter(Boolean).join("\n");
return { id: `resp_${r.cache_key ?? r.id ?? i}`, text }; return { id: `resp_${r.cache_key ?? r.id ?? i}`, text };
} catch { } catch {
return { id: `resp_${i}`, text: line.slice(0, 2000) }; return { id: `resp_${i}`, text: sanitize(line.slice(0, 2000)) };
} }
}); });
} }