scripts: chicago analyzer field-name fixes + vectorize sanitizer hardening
Some checks failed
lakehouse/auditor 1 blocking issue: todo!() macro call in tests/real-world/scrum_master_pipeline.ts
Some checks failed
lakehouse/auditor 1 blocking issue: todo!() macro call in tests/real-world/scrum_master_pipeline.ts
Two small fixes surfaced during smoke testing:
- analyze_chicago_contracts.ts: the permit field is contact_1_name, not contact_1, and reported_cost is an integer-like string. Fixed the filter (it was rejecting all 2853 permits) and the contractor extraction (it was coming back empty).
- vectorize_raw_corpus.ts: sanitize() expanded to strip control chars + ALL backslashes (kills incomplete \uXXXX escapes) + UTF-16 surrogates (unpaired surrogates from emoji split by the truncate boundary). The llm_team response cache had docs with all three pollution shapes.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
6ac7f61819
commit
779158a09b
@ -62,10 +62,11 @@ async function fetchPermits(n: number): Promise<Permit[]> {
|
|||||||
const proc = Bun.spawn(["mc", "cp", "-q", `${MC_ALIAS}/${RAW_BUCKET}/chicago/permits_2026-04-25.json`, local]);
|
const proc = Bun.spawn(["mc", "cp", "-q", `${MC_ALIAS}/${RAW_BUCKET}/chicago/permits_2026-04-25.json`, local]);
|
||||||
await proc.exited;
|
await proc.exited;
|
||||||
const all: Permit[] = JSON.parse(await Bun.file(local).text());
|
const all: Permit[] = JSON.parse(await Bun.file(local).text());
|
||||||
// Pick high-cost permits with named contractors — most interesting for staffing analysis
|
// Pick high-cost permits with named contractors — most interesting for staffing analysis.
|
||||||
|
// Field is `contact_1_name`, not `contact_1`. reported_cost is integer-like string.
|
||||||
const meaningful = all.filter(p =>
|
const meaningful = all.filter(p =>
|
||||||
p.reported_cost && Number(p.reported_cost) >= 100000 &&
|
p.reported_cost && Number(p.reported_cost) >= 100000 &&
|
||||||
(p.contact_1 || p.contact_2)
|
(p.contact_1_name || p.contact_2_name)
|
||||||
);
|
);
|
||||||
log(`raw permits: ${all.length} · meaningful (cost >= $100k + has contractor): ${meaningful.length}`);
|
log(`raw permits: ${all.length} · meaningful (cost >= $100k + has contractor): ${meaningful.length}`);
|
||||||
// Sample evenly across the meaningful set
|
// Sample evenly across the meaningful set
|
||||||
@ -79,8 +80,8 @@ async function fetchPermits(n: number): Promise<Permit[]> {
|
|||||||
|
|
||||||
function permitToText(p: Permit): string {
|
function permitToText(p: Permit): string {
|
||||||
const addr = `${p.street_number ?? ""} ${p.street_direction ?? ""} ${p.street_name ?? ""} ${p.suffix ?? ""}`.replace(/\s+/g, " ").trim();
|
const addr = `${p.street_number ?? ""} ${p.street_direction ?? ""} ${p.street_name ?? ""} ${p.suffix ?? ""}`.replace(/\s+/g, " ").trim();
|
||||||
const c1 = typeof p.contact_1 === "string" ? p.contact_1 : (p.contact_1?.name ?? p.contact_1?.contact_name ?? "");
|
const c1 = (p as any).contact_1_name ?? (typeof p.contact_1 === "string" ? p.contact_1 : (p.contact_1?.name ?? ""));
|
||||||
const c2 = typeof p.contact_2 === "string" ? p.contact_2 : (p.contact_2?.name ?? "");
|
const c2 = (p as any).contact_2_name ?? (typeof p.contact_2 === "string" ? p.contact_2 : (p.contact_2?.name ?? ""));
|
||||||
return [
|
return [
|
||||||
`Chicago Building Permit ${p.permit_ ?? "?"}`,
|
`Chicago Building Permit ${p.permit_ ?? "?"}`,
|
||||||
`Type: ${p.permit_type ?? "?"} · Status: ${p.permit_status ?? "?"}`,
|
`Type: ${p.permit_type ?? "?"} · Status: ${p.permit_status ?? "?"}`,
|
||||||
@ -187,7 +188,7 @@ async function analyzeOne(p: Permit, idx: number, total: number): Promise<any> {
|
|||||||
const permitText = permitToText(p);
|
const permitText = permitToText(p);
|
||||||
|
|
||||||
// Build matrix query: combine type + work description + contractor name for retrieval anchoring
|
// Build matrix query: combine type + work description + contractor name for retrieval anchoring
|
||||||
const c1 = typeof p.contact_1 === "string" ? p.contact_1 : (p.contact_1?.name ?? "");
|
const c1 = (p as any).contact_1_name ?? (typeof p.contact_1 === "string" ? p.contact_1 : (p.contact_1?.name ?? ""));
|
||||||
const matrixQuery = `${p.permit_type ?? ""} ${(p.work_description ?? "").slice(0, 300)} ${c1}`;
|
const matrixQuery = `${p.permit_type ?? ""} ${(p.work_description ?? "").slice(0, 300)} ${c1}`;
|
||||||
const matrix = await fetchMatrixHits(matrixQuery);
|
const matrix = await fetchMatrixHits(matrixQuery);
|
||||||
const corporaSummary = Object.entries(matrix.by_corpus).map(([k, v]) => `${k.split("_v")[0]}=${v}`).join(" ");
|
const corporaSummary = Object.entries(matrix.by_corpus).map(([k, v]) => `${k.split("_v")[0]}=${v}`).join(" ");
|
||||||
|
|||||||
@ -46,6 +46,18 @@ function truncate(s: string, n = 4000): string {
|
|||||||
return s == null ? "" : (s.length > n ? s.slice(0, n) : s);
|
return s == null ? "" : (s.length > n ? s.slice(0, n) : s);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Sanitize text before posting as JSON. Removes three kinds of pollution
// found in the llm_team response cache, any of which caused the gateway's
// Rust serde JSON parser to reject (400) the whole batch:
//   1. control characters (tab, LF and CR are kept);
//   2. backslashes — ALL of them, not only those in broken escapes, so a
//      \uXXXX sequence cut short in the source data cannot survive;
//   3. unpaired UTF-16 surrogates, e.g. half of an emoji split at the
//      truncate() boundary. Well-formed high+low pairs are left intact so
//      valid emoji and other astral characters are preserved.
// Returns "" for empty or missing input.
function sanitize(s: string): string {
  if (!s) return "";
  return s
    // strip control chars (keeps \t, \n, \r)
    .replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g, "")
    // strip ALL backslashes — kills any malformed \uXXXX in source data
    .replace(/\\/g, "")
    // strip only LONE surrogates: a high one not followed by a low one,
    // or a low one not preceded by a high one
    .replace(/[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF]/g, "");
}
|
||||||
|
|
||||||
// ─── EXTRACTORS — one per source ───
|
// ─── EXTRACTORS — one per source ───
|
||||||
// Each shapes raw rows into {id, text} for the gateway's chunker.
|
// Each shapes raw rows into {id, text} for the gateway's chunker.
|
||||||
|
|
||||||
@ -129,13 +141,13 @@ function extractLlmTeamResponseCache(raw: string): Doc[] {
|
|||||||
const text = [
|
const text = [
|
||||||
`Cached response ${r.cache_key ?? r.id ?? i}`,
|
`Cached response ${r.cache_key ?? r.id ?? i}`,
|
||||||
`Created: ${r.created_at ?? "?"}`,
|
`Created: ${r.created_at ?? "?"}`,
|
||||||
r.prompt ? `Prompt: ${truncate(r.prompt, 1500)}` : "",
|
r.prompt ? `Prompt: ${sanitize(truncate(r.prompt, 1500))}` : "",
|
||||||
r.response ? `Response: ${truncate(r.response, 2500)}` : "",
|
r.response ? `Response: ${sanitize(truncate(r.response, 2500))}` : "",
|
||||||
r.model ? `Model: ${r.model}` : "",
|
r.model ? `Model: ${r.model}` : "",
|
||||||
].filter(Boolean).join("\n");
|
].filter(Boolean).join("\n");
|
||||||
return { id: `resp_${r.cache_key ?? r.id ?? i}`, text };
|
return { id: `resp_${r.cache_key ?? r.id ?? i}`, text };
|
||||||
} catch {
|
} catch {
|
||||||
return { id: `resp_${i}`, text: line.slice(0, 2000) };
|
return { id: `resp_${i}`, text: sanitize(line.slice(0, 2000)) };
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user