// Project Index — per-Chicago-permit public-data signal portfolio.
// (Originally called "ETF" until we realized that's an SEC-regulated term;
// this is NOT an investment product. It's a custom index of Chicago
// building-related signals.)
//
// Pulls violation & corp-registry signals keyed to the contractor/applicant
// names extracted from public building permits. The point is to surface
// the kinds of signals a staffer or investor would want BEFORE committing
// workers or capital: OSHA history, corporate-registry status, federal
// contracts, parent-company chain, building violations, and (eventually)
// LLC-shuffle detection where a firm with a bad record dissolves and
// reforms under a new name.
//
// Sources right now:
//   - OSHA Enforcement search (www.osha.gov/ords/imis/establishment.search)
//     — HTML only, scraped. Reachable from our ASN (verified).
//   - Illinois Secretary of State — BLOCKED from our datacenter ASN.
//     We return a structured placeholder with an honest "source_unreachable"
//     flag so the UI can show "awaiting source" rather than fake data.
//   - OpenCorporates — requires API token even for free tier (not self-serve).
//
// Caching: JSONL at data/_entity_cache/entities.jsonl. Keyed by
// normalized company name. 30-day TTL because entity status and
// violation history don't change fast.
//
// Politeness: OSHA gets max 1 req/sec (with jitter) and a descriptive UA.
import { readFile, writeFile, mkdir } from "node:fs/promises";
import { existsSync } from "node:fs";
import { join } from "node:path";
import { findTifDistrict } from "./tif_polygons.js";

const CACHE_DIR = "/home/profit/lakehouse/data/_entity_cache";
const CACHE_FILE = join(CACHE_DIR, "entities.jsonl");
const DEFAULT_TTL_MS = 30 * 24 * 60 * 60 * 1000;
const UA = "lakehouse-staffing-copilot/1.0 (contact: ops@devop.live; public-permit-enrichment)";
const OSHA_MIN_GAP_MS = 1200;

// Epoch-ms of the most recently granted OSHA request slot. May lie in the
// future while a caller is still sleeping toward its reserved slot.
let lastOshaAt = 0;

// Rate gate for OSHA requests: resolves once the caller may issue a request.
// Consecutive slots are spaced at least OSHA_MIN_GAP_MS apart, plus up to
// 200ms of random jitter whenever a wait is required.
//
// The slot is reserved (lastOshaAt updated) BEFORE awaiting. Updating it
// only after the sleep lets concurrent callers all read the same stale
// timestamp, sleep the same amount, and fire together.
async function oshaGate() {
  const now = Date.now();
  const earliest = lastOshaAt + OSHA_MIN_GAP_MS;
  if (now >= earliest) {
    lastOshaAt = now;
    return;
  }
  const slot = earliest + Math.random() * 200;
  lastOshaAt = slot;
  await new Promise((r) => setTimeout(r, slot - now));
}

// ─── Types ────────────────────────────────────────────────────────

// One row of the OSHA establishment-search result table.
export type OshaInspection = {
  id: string;
  date: string;
  state: string;
  type: string;
  scope: string;
  naics: string;
  establishment: string;
  detail_url: string;
};

// Summary of an OSHA establishment search for one searched name.
export type OshaBrief = {
  source: "osha";
  fetched_at: string;
  searched_name: string;
  normalized_name: string;
  source_url: string;
  status: "ok" | "no_match" | "error";
  inspection_count: number;
  most_recent_date: string | null;
  recent_inspections: OshaInspection[];
  states_seen: string[];
  error?: string;
};

// Illinois Secretary of State registry result. "source_unreachable" is the
// honest placeholder status used while the source is blocked.
export type IlsosBrief = {
  source: "ilsos";
  fetched_at: string;
  searched_name: string;
  status: "source_unreachable" | "ok" | "no_match" | "error";
  reason?: string;
  entity_name?: string;
  file_number?: string;
  status_text?: string;
  formation_date?: string;
  registered_agent?: string;
  principal_address?: string;
  officers?: string[];
  error?: string;
};

// Ticker brief — SEC EDGAR (name → CIK → ticker) + Stooq (live price).
// Both are free & no-auth. SEC is the authoritative name→ticker source
// (company_tickers.json covers 10K+ US-listed issuers); Stooq layers
// current price on top. We intentionally skip Yahoo Finance — its free
// endpoint has been auth-walled since 2024.
//
// "Market cap proxy" is price × volume rather than shares_outstanding ×
// price because shares_outstanding requires scraping an SEC filing for
// each issuer. Good enough for ranking "most profitable related company"
// until we wire XBRL facts in pass 2.
export type TickerBrief = {
  source: "sec+stooq";
  fetched_at: string;
  searched_name: string;
  status: "ok" | "no_match" | "error";
  // Issuer identity fields — all optional in the type.
  ticker?: string;
  company_name?: string;
  cik?: string;
  exchange?: string;
  sic?: string;
  sic_description?: string;
  // Live quote fields (may be absent on holidays/weekends)
  price?: number;
  price_date?: string;
  volume?: number;
  open?: number;
  high?: number;
  low?: number;
  day_change_pct?: number;
  // Ranking hint — price × volume, used to sort tickers in the portfolio.
  cap_proxy?: number;
  sec_url?: string;
  stooq_url?: string;
  // Presumably set when status is "error" — confirm against the fetcher.
  error?: string;
};

// Property owner brief — Cook County Assessor parcels dataset (c49d-89sn).
// Lookup by property address → PIN → owner mailing address. The mailing
// address is the key signal: it tells you who the assessor mails the
// tax bill to. PO Box in Minneapolis MN ~= TARGET CORP HQ (verified
// 1101 W Jackson Blvd → PO Box 9456 Minneapolis 55440 = Target).
// Free, no auth, Socrata.
export type PropertyOwnerBrief = {
  source: "cook_county_assessor";
  fetched_at: string;
  searched_address: string;
  status: "ok" | "no_match" | "error";
  pin?: string;
  property_address?: string;
  property_zip?: string;
  // Mailing addr is the smoking gun for ownership inference.
  mailing_address?: string;
  mailing_city?: string;
  mailing_state?: string;
  mailing_zip?: string;
  township?: string;
  ward?: string;
  // Coordinates are typed as strings here, not numbers.
  longitude?: string;
  latitude?: string;
  source_url?: string;
  error?: string;
};

// Building violations on the property. Stop-work orders, code violations,
// failed inspections — direct "is this build behind schedule" signals.
// Dataset 22u3-xenr.
export type PropertyViolationsBrief = {
  source: "chicago_building_violations";
  fetched_at: string;
  searched_address: string;
  status: "ok" | "error";
  total_violations: number;
  open_violations: number;
  stop_work_orders: number;
  most_recent_date?: string;
  recent_violations: Array<{
    date: string;
    status: string;
    description: string;
    department: string;
  }>;
  error?: string;
};

// Debarment / Excluded Parties — federal SAM.gov + Illinois Dept of Labor.
// Both are "free in spirit" but blocked behind some friction:
//   - SAM.gov v3 API requires registered API key (free at sam.gov but
//     requires email + entity registration; can't be self-served from
//     our server without a one-time human signup).
//   - IDOL prevailing-wage debarment list is published as HTML/PDF on
//     labor.illinois.gov, no API — needs a scraper + periodic refresh.
// Returning structured "needs_setup" placeholder so the UI can show
// the path forward rather than fake clean.
export type DebarmentBrief = {
  source: "sam_gov+idol";
  fetched_at: string;
  searched_name: string;
  status: "ok" | "no_match" | "needs_setup";
  reason?: string;
  sam_excluded?: boolean;
  idol_debarred?: boolean;
  excluded_until?: string;
  agency?: string;
};

// Returns the debarment brief for `name`. A cached "debarment" entry wins
// if one exists; otherwise a "needs_setup" placeholder is returned. The
// placeholder is intentionally not written to the cache.
export async function fetchDebarmentBrief(name: string): Promise<DebarmentBrief> {
  const now = new Date().toISOString();
  const cached = await cacheGet(normalizeEntityName(name), "debarment");
  if (cached) return cached;
  const brief: DebarmentBrief = {
    source: "sam_gov+idol",
    fetched_at: now,
    searched_name: name,
    status: "needs_setup",
    reason: "SAM.gov v3 needs registered API key (free signup at sam.gov). IDOL prevailing-wage debarment list is HTML-only at labor.illinois.gov — needs scraper + periodic refresh.",
  };
  return brief;
}

// SEC EDGAR 10-K Exhibit 21 — parent/subsidiary tree. The "private GC →
// public parent ticker" chain. Each public company files Exhibit 21 of
// their 10-K listing all subsidiaries. We resolve by:
//   1. Fetch SEC company submissions for a known parent ticker
//   2. Find the most recent 10-K filing
//   3. Fetch the Exhibit 21 attachment HTML
//   4. Parse subsidiary names
//
// Reverse direction (subsidiary name → parent) requires either:
//   (a) Pre-build an index of "subsidiary → parent" by walking every
//       10-K Exhibit 21 in our SEC index (~10K filings, large)
//   (b) Use a known mapping for high-volume parents (Turner→ACS,
//       Skanska→SKA-B.ST, Balfour Beatty→BBY.L, etc)
// (b) is pragmatic for v1; (a) is the proper long-term answer.
export type ParentLinkBrief = {
  source: "sec_exhibit21+wikidata";
  fetched_at: string;
  searched_name: string;
  status: "ok" | "no_link" | "needs_index";
  reason?: string;
  parent_name?: string;
  parent_ticker?: string;
  parent_exchange?: string;
  parent_country?: string;
  link_source?: string;
};

// Hand-curated map of well-known private contractor → public parent.
// The cleanest seed of the index — covers the dominant Chicago GCs.
// Each entry includes the source citation so a human can verify.
// Keys are upper-case, punctuation-free word sequences; fetchParentLink
// matches them as whole words against the cleaned search name.
const KNOWN_PARENT_MAP: Record<string, ParentLinkBrief> = {
  "TURNER CONSTRUCTION": {
    source: "sec_exhibit21+wikidata",
    fetched_at: "",
    searched_name: "TURNER CONSTRUCTION",
    status: "ok",
    parent_name: "Hochtief AG (subsidiary of ACS Group)",
    parent_ticker: "HOC.DE",
    parent_exchange: "Frankfurt (also ACS:MC.MC Madrid)",
    parent_country: "DE/ES",
    link_source: "Hochtief 2024 Annual Report — Turner is wholly-owned via Hochtief Americas",
  },
  "SKANSKA USA": {
    source: "sec_exhibit21+wikidata",
    fetched_at: "",
    searched_name: "SKANSKA USA",
    status: "ok",
    parent_name: "Skanska AB",
    parent_ticker: "SKA-B.ST",
    parent_exchange: "Stockholm",
    parent_country: "SE",
    link_source: "Skanska 2024 Annual Report Exhibit",
  },
  "WALSH GROUP": {
    source: "sec_exhibit21+wikidata",
    fetched_at: "",
    searched_name: "WALSH GROUP",
    status: "no_link",
    reason: "Walsh Group is private (Chicago — Walsh family-owned).",
  },
  "POWER CONSTRUCTION": {
    source: "sec_exhibit21+wikidata",
    fetched_at: "",
    searched_name: "POWER CONSTRUCTION",
    status: "no_link",
    reason: "Power Construction Co is private (Schaumburg, IL — family-owned).",
  },
  "PEPPER CONSTRUCTION": {
    source: "sec_exhibit21+wikidata",
    fetched_at: "",
    searched_name: "PEPPER CONSTRUCTION",
    status: "no_link",
    reason: "Pepper Construction Group is private (Barrington, IL — Pepper family).",
  },
  "CLAYCO": {
    source: "sec_exhibit21+wikidata",
    fetched_at: "",
    searched_name: "CLAYCO",
    status: "no_link",
    reason: "Clayco is private (Chicago — Bob Clark family).",
  },
  "MCHUGH CONSTRUCTION": {
    source: "sec_exhibit21+wikidata",
    fetched_at: "",
    searched_name: "MCHUGH CONSTRUCTION",
    status: "no_link",
    reason: "James McHugh Construction is private.",
  },
  "BULLEY ANDREWS": {
    source: "sec_exhibit21+wikidata",
    fetched_at: "",
    searched_name: "BULLEY ANDREWS",
    status: "no_link",
    reason: "Bulley & Andrews is private (Chicago, family-owned since 1891).",
  },
  "LEND LEASE": {
    source: "sec_exhibit21+wikidata",
    fetched_at: "",
    searched_name: "LEND LEASE",
    status: "ok",
    parent_name: "Lendlease Group",
    parent_ticker: "LLC.AX",
    parent_exchange: "Sydney (ASX)",
    parent_country: "AU",
    link_source: "Lendlease Annual Report — global construction division",
  },
  "GILBANE": {
    source: "sec_exhibit21+wikidata",
    fetched_at: "",
    searched_name: "GILBANE",
    status: "no_link",
    reason: "Gilbane is private (Providence RI — Gilbane family, 6th gen).",
  },
  "WHITING TURNER": {
    source: "sec_exhibit21+wikidata",
    fetched_at: "",
    searched_name: "WHITING TURNER",
    status: "no_link",
    reason: "Whiting-Turner is private (Baltimore MD).",
  },
  "STO BUILDING GROUP": {
    source: "sec_exhibit21+wikidata",
    fetched_at: "",
    searched_name: "STO BUILDING GROUP",
    status: "no_link",
    reason: "STO Building Group is private (Structure Tone family of firms).",
  },
};

// Reduces a raw name to the same shape as the KNOWN_PARENT_MAP keys:
// upper-case, punctuation collapsed to single spaces, standalone "AND"
// dropped. "Whiting-Turner" → "WHITING TURNER"; "Bulley & Andrews" and
// "Bulley and Andrews" → "BULLEY ANDREWS".
function parentMatchText(name: string): string {
  return (name || "")
    .toUpperCase()
    .replace(/[^A-Z0-9]+/g, " ")
    .replace(/\bAND\b/g, " ")
    .replace(/\s+/g, " ")
    .trim();
}

// Resolves the parent-company link for `name`.
//   1. A cached "parent" entry wins.
//   2. Otherwise the curated map is searched on the cleaned name. Keys
//      must match as whole words, so "EMPOWER CONSTRUCTION" does not hit
//      "POWER CONSTRUCTION". When several keys match, the one starting
//      earliest in the name wins (longest on a tie), so
//      "WHITING-TURNER CONSTRUCTION" resolves to Whiting-Turner, not Turner.
//   3. No match → "needs_index" placeholder.
// The result is not written to the cache here.
export async function fetchParentLink(name: string): Promise<ParentLinkBrief> {
  const now = new Date().toISOString();
  const cached = await cacheGet(normalizeEntityName(name), "parent");
  if (cached) return cached;

  // Pad with spaces so indexOf(" KEY ") only succeeds on word boundaries.
  const haystack = ` ${parentMatchText(name)} `;
  let best: ParentLinkBrief | null = null;
  let bestAt = Infinity;
  let bestLen = 0;
  for (const [key, val] of Object.entries(KNOWN_PARENT_MAP)) {
    const at = haystack.indexOf(` ${key} `);
    if (at === -1) continue;
    if (at < bestAt || (at === bestAt && key.length > bestLen)) {
      best = val;
      bestAt = at;
      bestLen = key.length;
    }
  }
  if (best) {
    return { ...best, fetched_at: now, searched_name: name };
  }

  return {
    source: "sec_exhibit21+wikidata",
    fetched_at: now,
    searched_name: name,
    status: "needs_index",
    reason: "Not in curated parent map. Full SEC Exhibit 21 walk would build a complete subsidiary→parent index — queued for batch 5.",
  };
}

// Site Context — non-contractor signals tied to the property's location.
// TIF status (public subsidy), landmark (preservation = longer schedule),
// community area + median income, lat/long. Pulls from Chicago Open Data.
export type SiteContextBrief = {
  source: "chicago_socrata";
  fetched_at: string;
  searched_address: string;
  status: "ok" | "no_match" | "error";
  in_tif_district?: boolean;
  tif_district_name?: string;
  is_landmark?: boolean;
  landmark_name?: string;
  community_area?: string;
  community_area_number?: string;
  ward?: string;
  longitude?: string;
  latitude?: string;
  // Transit access: distance to nearest CTA L station (meters).
  // Affects worker access — sub-1500m means crews can ride transit
  // instead of needing parking/shuttle.
  nearest_cta_station?: string;
  nearest_cta_lines?: string;
  nearest_cta_distance_m?: number;
  // Permits within 800m issued in last 90d. High count = active labor
  // competition for the same crew pool.
  nearby_permits_90d?: number;
  nearby_permits_value_90d?: number;
  error?: string;
};

// MBE/WBE/DBE diversity certification — Chicago Dept of Procurement.
// Note: as of 2026-04, the catalog-listed datasets (2iq3-bugw, 69yt-tb5j,
// ci93-uc8s, etc.) all return 404 on the resource endpoint despite
// appearing in the catalog search. Likely archived or auth-gated.
// Returning needs_setup until alternate path identified.
// Result shape for the diversity-certification lookup. "needs_setup" is
// part of the status union alongside the usual ok / no_match / error.
export type DiversityCertBrief = {
  source: "chicago_dps_economic_inclusion";
  fetched_at: string;
  searched_name: string;
  status: "ok" | "no_match" | "error" | "needs_setup";
  reason?: string;
  certifications: Array<{ category: string; expiration?: string; type?: string }>;
  source_url?: string;
  error?: string;
};

// News mentions — Google News RSS (free, no auth). Returns recent
// headlines mentioning the contractor name. Quick reputation signal.
export type NewsBrief = {
  source: "google_news_rss";
  fetched_at: string;
  searched_name: string;
  status: "ok" | "no_match" | "error";
  total_mentions: number;
  recent_headlines: Array<{ title: string; source: string; date: string; url: string }>;
  error?: string;
};

// BLS construction employment — macro context for ALL Chicago permits.
// Series SMU17169802000000001 = Construction, all employees, Chicago-
// Naperville-Elgin MSA, NSA, monthly. Free public API, no auth.
// Cached process-wide (single request per minutes-window) since this
// is macroeconomic context — same number applies to every permit shown.
export type BlsTrendBrief = {
  source: "bls.gov";
  fetched_at: string;
  series_id: string;
  status: "ok" | "error";
  // Nullable: no latest observation / no computed change available.
  latest: { period: string; value: number } | null;
  yoy_change_pct: number | null;
  mom_change_pct: number | null;
  recent: Array<{ period: string; value: number }>; // last 6 months
  trend: "growing" | "stable" | "declining" | "unknown";
  error?: string;
};

// News sentiment — basic positive/negative keyword scoring layered over
// the existing Google News RSS results. Not LLM-grade but catches the
// obvious "lawsuit / fraud / fired / OSHA / debt" signals vs "awarded /
// expansion / contract win" signals. Output is a -1..+1 score with
// counts of positive/negative-flagged headlines.
export type NewsSentiment = {
  score: number; // -1 to +1
  positive: number;
  negative: number;
  neutral: number;
  flagged_headlines: Array<{ title: string; polarity: "pos" | "neg"; reasons: string[] }>;
};

// OSHA Severe Violator Enforcement Program (SVEP) — explicit named
// list of contractors placed in heightened federal/state OSHA
// enforcement after a willful or repeat-violation event. Highest-
// signal "this firm has been formally flagged as a worst actor" check.
//
// Data refreshed quarterly from osha.gov/enforcement/svep XLSX
// (Federal + State logs). Pre-parsed to JSON at
// /data/_entity_cache/svep_log.json. ~1,044 entries (990 federal + 54 state).
// Manual refresh: re-download Public_Federal_SVEP_Tracking_Log.xlsx +
// Public_State_SVEP_Log.xlsx, run the Python parser, update JSON.
export type SvepBrief = {
  source: "osha.gov/enforcement/svep";
  fetched_at: string;
  searched_name: string;
  status: "ok" | "no_match" | "error";
  flagged: boolean;
  matched_entries: Array<{ name: string }>;
  log_size: number;
  log_age_days: number;
};

// Process-wide SVEP log, loaded lazily on first lookup. null = not loaded yet.
let _svepIdx: Array<{ name: string; normalized: string }> | null = null;
let _svepMeta: { fetched_at: string; size: number } | null = null;

// Loads the pre-parsed SVEP log from disk once per process. A missing,
// unreadable or malformed file leaves an empty index (lookups then report
// "no_match") instead of throwing; read/parse failures are logged.
async function ensureSvepLoaded() {
  if (_svepIdx) return;
  const path = "/home/profit/lakehouse/data/_entity_cache/svep_log.json";
  if (!existsSync(path)) {
    _svepIdx = [];
    _svepMeta = { fetched_at: "", size: 0 };
    return;
  }
  try {
    const raw = JSON.parse(await readFile(path, "utf-8"));
    // Keep only entries with a string name: fetchSvepBrief splits on it.
    const companies: Array<{ name: string; normalized: string }> = Array.isArray(raw?.companies)
      ? raw.companies.filter((e: any) => e && typeof e.name === "string")
      : [];
    _svepIdx = companies;
    _svepMeta = { fetched_at: raw?.fetched_at || "unknown", size: companies.length };
  } catch (e) {
    console.warn("[entity] svep log load failed:", (e as Error).message);
    _svepIdx = [];
    _svepMeta = { fetched_at: "", size: 0 };
  }
}

// Checks `name` against the SVEP log.
//
// An entry matches when any of its "/"-separated aliases either
//   - normalizes to exactly the same key as `name`, or
//   - shares at least 2 DISTINCT words of 4+ characters with it.
// Each entry contributes at most one match (its first matching alias),
// and at most 5 matches are returned. log_age_days is 0 when the log's
// fetched_at is missing or unparseable.
//
// NOTE(review): the 2-word overlap rule counts generic trade words
// ("CONSTRUCTION", "SERVICES"), so unrelated firms sharing two such words
// will be flagged — consider a stopword list.
export async function fetchSvepBrief(name: string): Promise<SvepBrief> {
  const now = new Date().toISOString();
  await ensureSvepLoaded();
  const idx = _svepIdx || [];
  const meta = _svepMeta || { fetched_at: "", size: 0 };

  // SVEP entries can have multiple aliases joined by "/". Match any token
  // overlap with our query's identifying-words set.
  const queryNorm = normalizeEntityName(name);
  if (!queryNorm) {
    return {
      source: "osha.gov/enforcement/svep",
      fetched_at: now,
      searched_name: name,
      status: "no_match",
      flagged: false,
      matched_entries: [],
      log_size: meta.size,
      log_age_days: 0,
    };
  }

  const queryWords = new Set(
    queryNorm.split(" ").filter((w) => w.length >= 4),
  );
  const matches: Array<{ name: string }> = [];
  for (const e of idx) {
    // SVEP names use "/" to separate aliases. Check each alias.
    const aliases = e.name.split("/").map((a) => a.trim()).filter(Boolean);
    for (const a of aliases) {
      const aNorm = normalizeEntityName(a);
      if (!aNorm) continue;
      // Direct exact normalized match
      if (aNorm === queryNorm) {
        matches.push({ name: a });
        break;
      }
      // Word-overlap: at least 2 shared words ≥ 4 chars. Alias words are
      // de-duplicated first so a repeated word ("SMITH SMITH") cannot
      // count as two shared words.
      const aWords = new Set(aNorm.split(" ").filter((w) => w.length >= 4));
      let overlap = 0;
      for (const w of aWords) {
        if (queryWords.has(w)) overlap += 1;
      }
      if (overlap >= 2) {
        matches.push({ name: a });
        break;
      }
    }
  }

  let logAge = 0;
  if (meta.fetched_at) {
    const t = Date.parse(meta.fetched_at);
    if (!Number.isNaN(t)) logAge = Math.round((Date.now() - t) / 86400000);
  }

  return {
    source: "osha.gov/enforcement/svep",
    fetched_at: now,
    searched_name: name,
    status: matches.length > 0 ? "ok" : "no_match",
    flagged: matches.length > 0,
    matched_entries: matches.slice(0, 5),
    log_size: meta.size,
    log_age_days: logAge,
  };
}

// OSHA Severe Injury Reports — placeholder, bot-protected.
export type OshaSirBrief = {
  source: "osha.gov/severeinjury";
  fetched_at: string;
  searched_name: string;
  status: "needs_setup";
  reason: string;
};

// Cook County Recorder mechanics liens — placeholder, scrape research needed.
export type LiensBrief = {
  source: "cook_county_recorder";
  fetched_at: string;
  searched_address: string;
  status: "needs_setup";
  reason: string;
};

// USASpending.gov federal contracts. Free API, no auth.
// api.usaspending.gov/api/v2/search/spending_by_award/
// Returns total awarded $, awarding agency, NAICS, subaward count.
// Strong signal: a Chicago contractor with $50M of federal work has
// scale + compliance posture. Quiet contractors are private/local.
export type FederalContractsBrief = {
  source: "usaspending.gov";
  fetched_at: string;
  searched_name: string;
  status: "ok" | "no_match" | "error";
  total_awards_count: number;
  total_awards_value: number;
  most_recent_award_date?: string;
  // Per-agency award value; ordering is presumably by value — confirm
  // against the fetcher.
  top_agencies: Array<{ agency: string; value: number }>;
  source_url?: string;
  error?: string;
};

// NLRB cases — union actions / unfair labor practice charges.
// Free public search at apps.nlrb.gov/case but the JSON endpoint
// is undocumented. Need short scraper. For now placeholder.
export type NlrbBrief = {
  source: "nlrb.gov";
  fetched_at: string;
  searched_name: string;
  status: "ok" | "no_match" | "needs_setup";
  reason?: string;
  total_cases?: number;
  recent_cases?: Array<{ case_number: string; case_type: string; date: string; status: string }>;
};

// Union locals likely to handle each trade. Static lookup — no API.
// Static is fine because Chicago union jurisdictions don't change often,
// and the granularity here is "show staffers what unions are involved."
// We don't claim certainty per project; we surface the typical mapping.
export type UnionMapping = {
  trade: string;
  primary_locals: Array<{ name: string; local: string; jurisdiction: string }>;
  // Training centers run by the JATCs (joint apprenticeship)
  training_centers: Array<{ name: string; address: string; program_length: string }>;
};

// Contractor history over the Chicago permit dataset. Cheap signal:
// how active has this contractor been historically, and what's their
// trajectory in the last 24 months? Helpful for spotting "suddenly
// active" (possible shell LLC) or "long-tenured + now declining"
// (losing market share).
export type ContractorHistory = {
  source: "chicago_socrata";
  fetched_at: string;
  searched_name: string;
  status: "ok" | "error";
  permits_last_180d: number;
  permits_last_24mo: number;
  permits_historical_total: number;
  total_cost_last_24mo: number;
  trend: "growing" | "stable" | "declining" | "new" | "unknown";
  recent_permits: Array<{
    date: string;
    work_type: string;
    cost: number;
    address: string;
    lat?: number;
    lng?: number;
    permit_id?: string;
    description?: string;
  }>;
  error?: string;
};

// Everything we know about one named entity on a permit. Each source
// brief is null when that source has no result attached.
export type EntityBrief = {
  key: string;
  display_name: string;
  role: string;
  ticker: string;
  osha: OshaBrief | null;
  ilsos: IlsosBrief | null;
  stock: TickerBrief | null;
  history: ContractorHistory | null;
  debarment: DebarmentBrief | null;
  parent_link: ParentLinkBrief | null;
  federal: FederalContractsBrief | null;
  nlrb: NlrbBrief | null;
  diversity: DiversityCertBrief | null;
  news: NewsBrief | null;
  news_sentiment: NewsSentiment | null;
  osha_sir: OshaSirBrief | null;
  svep: SvepBrief | null;
  risk: {
    score: number | null;
    factors: string[];
    partial: boolean;
  };
};

// Project Index Score — auditable weighted aggregation of every signal
// we pulled. Each contribution carries its weight, direction (good/bad),
// raw value, and a human-readable note so staffers can see exactly why
// the project landed where it did. Score ranges 0-100 anchored at 50
// neutral. Bands: red (<30), amber (30-45), neutral (45-55), green
// (55-75), strong (75+).
export type SignalContribution = {
  signal: string; // e.g. "osha_recent_inspections"
  weight: number; // 1-10 importance
  direction: -1 | 0 | 1; // -1 bad, 0 neutral, +1 good
  raw: any; // underlying value
  contribution: number; // signed points moved away from 50
  note: string; // explanation
};

export type ProjectIndexScore = {
  score: number;
  band: "red" | "amber" | "neutral" | "green" | "strong";
  contributions: SignalContribution[];
  partial: boolean; // true when ILSOS / SAM / NLRB / other key sources unavailable
};

export type PermitEntityBrief = {
  permit_id: string;
  property: {
    address: string;
    ticker: string;
    owner: PropertyOwnerBrief | null;
    violations: PropertyViolationsBrief | null;
    union: UnionMapping | null;
    site_context: SiteContextBrief | null;
    liens: LiensBrief | null;
  };
  entities: EntityBrief[];
  tickers: TickerBrief[];
  // Macroeconomic context — same for all permits served in same window.
  // Tells the staffer "the broader Chicago construction labor market
  // is shrinking" or "growing" — context for the per-permit decision.
  macro: BlsTrendBrief | null;
  // Aggregate score across all entities + property signals
  index_score: ProjectIndexScore;
  roadmap: string[];
  generated_at: string;
};

// ─── Cache ────────────────────────────────────────────────────────

type CacheKind =
  | "osha"
  | "ilsos"
  | "ticker"
  | "history"
  | "owner"
  | "violations"
  | "debarment"
  | "parent"
  | "federal"
  | "nlrb"
  | "site_context"
  | "diversity"
  | "news"
  | "osha_sir"
  | "liens";

// One JSONL line in the cache file. expires_at is epoch milliseconds.
type CacheRow = {
  key: string;
  kind: CacheKind;
  data: any;
  expires_at: number;
};

// Fully loaded in-memory cache, keyed "<kind>:<key>". Stays null until
// the disk load has completed.
let cacheMap: Map<string, CacheRow> | null = null;
// The single disk load shared by every caller that arrives before
// cacheMap is published.
let cacheLoading: Promise<Map<string, CacheRow>> | null = null;

// Reads the JSONL cache file into a fresh Map. Never rejects: the cache
// is best-effort, so directory/read failures are logged and yield an
// empty (or partial) map. Malformed rows are skipped. When a key appears
// more than once, the row with the latest expiry wins.
async function loadCacheFromDisk(): Promise<Map<string, CacheRow>> {
  const loaded = new Map<string, CacheRow>();
  try {
    if (!existsSync(CACHE_DIR)) {
      await mkdir(CACHE_DIR, { recursive: true });
    }
  } catch (e) {
    console.warn("[entity] cache dir create failed:", (e as Error).message);
  }
  if (!existsSync(CACHE_FILE)) return loaded;
  try {
    const text = await readFile(CACHE_FILE, "utf-8");
    for (const line of text.split("\n")) {
      if (!line.trim()) continue;
      try {
        const row = JSON.parse(line) as CacheRow;
        const mk = `${row.kind}:${row.key}`;
        const prev = loaded.get(mk);
        if (!prev || prev.expires_at < row.expires_at) loaded.set(mk, row);
      } catch {
        /* skip malformed row */
      }
    }
  } catch (e) {
    console.warn("[entity] cache load failed:", (e as Error).message);
  }
  return loaded;
}

// Returns the in-memory cache, loading it from disk on first use.
// The load promise is memoized so concurrent first callers all wait for
// the same completed load. Publishing an empty Map before the read
// finishes would hand later callers an empty cache and produce spurious
// misses (and redundant upstream fetches).
async function ensureCacheLoaded(): Promise<Map<string, CacheRow>> {
  if (cacheMap) return cacheMap;
  if (!cacheLoading) {
    cacheLoading = loadCacheFromDisk().then((loaded) => {
      cacheMap = loaded;
      return loaded;
    });
  }
  return cacheLoading;
}

// Returns the cached payload for (kind, key), or null when the entry is
// absent or past its expiry.
async function cacheGet(key: string, kind: CacheKind): Promise<any | null> {
  const m = await ensureCacheLoaded();
  const row = m.get(`${kind}:${key}`);
  if (!row) return null;
  if (row.expires_at < Date.now()) return null;
  return row.data;
}

// Stores `data` under (kind, key) in memory and appends the row to the
// JSONL file. A failed disk write is logged and otherwise ignored — the
// in-memory entry is kept either way.
async function cacheSet(
  key: string,
  kind: CacheKind,
  data: any,
  ttl = DEFAULT_TTL_MS,
) {
  const m = await ensureCacheLoaded();
  const row: CacheRow = { key, kind, data, expires_at: Date.now() + ttl };
  m.set(`${kind}:${row.key}`, row);
  try {
    await writeFile(CACHE_FILE, JSON.stringify(row) + "\n", { flag: "a" });
  } catch (e) {
    console.warn("[entity] cache write failed:", (e as Error).message);
  }
}

// ─── Name normalization ─────────────────────────────────────────────
// Collapses "ADT,LLC" / "ADT LLC" / "ADT, L.L.C." to a single
// canonical key so cache hits don't miss on punctuation drift.
// Canonical cache/match key for a company name: upper-cased, with
// punctuation, "THE"/"AND", and common entity suffixes (LLC, INC, CORP,
// GROUP, …) removed and whitespace collapsed. Returns "" for empty or
// missing input, or when nothing but suffixes remains.
export function normalizeEntityName(name: string): string {
  return (name || "")
    .toUpperCase()
    // Split dotted/hyphenated forms first so "L.L.C." becomes "L L C".
    .replace(/[,.\-]/g, " ")
    .replace(/\b(THE|AND)\b/g, "")
    .replace(/\b(L\s*L\s*C|L\s*L\s*P|L\s*P|INC|CO|CORP|COMPANY|CORPORATION|LTD|LIMITED|GROUP|HOLDINGS|ENTERPRISES)\b/g, "")
    .replace(/[^\w ]/g, " ")
    .replace(/\s+/g, " ")
    .trim();
}

// Short display ticker for an entity: up to the first 3 normalized words,
// each cut to 6 chars, joined by "-" and capped at 18 chars, prefixed
// "LLC·". Falls back to "LLC·UNKNOWN" when the name normalizes to "".
export function entityTicker(name: string): string {
  const n = normalizeEntityName(name);
  const parts = n.split(" ").filter(Boolean).slice(0, 3);
  const short = parts.map((p) => p.slice(0, 6)).join("-").slice(0, 18);
  return `LLC·${short || "UNKNOWN"}`;
}

// Short display ticker for a property address: "BLDG·<WORD>-<DIGITS>",
// where WORD is the first whitespace-separated token starting with a
// letter (upper-cased, max 8 chars, default "BLDG") and DIGITS is the
// first 5 of all digits in the address concatenated (default "0000").
// A missing address is treated as "" rather than throwing, matching
// normalizeEntityName's tolerance of empty input.
function addressTicker(address: string): string {
  const text = address || "";
  const digits = (text.match(/\d+/g) || []).join("");
  const firstWord = (text.split(/\s+/).find((w) => /^[A-Za-z]/.test(w)) || "BLDG")
    .toUpperCase()
    .slice(0, 8);
  return `BLDG·${firstWord}-${digits.slice(0, 5) || "0000"}`;
}

// ─── OSHA scraper ──────────────────────────────────────────────────

// Fetches the raw OSHA establishment-search HTML for `name` (all states,
// all cases). Waits on the OSHA rate gate first, sends the descriptive
// User-Agent, and aborts after 15s. Throws on a non-2xx response.
async function fetchOshaHTML(name: string): Promise<string> {
  await oshaGate();
  const u = new URL("https://www.osha.gov/ords/imis/establishment.search");
  u.searchParams.set("p_logger", "1");
  u.searchParams.set("establishment", name);
  u.searchParams.set("State", "all");
  u.searchParams.set("p_case", "all");
  const res = await fetch(u.toString(), {
    headers: { "User-Agent": UA },
    signal: AbortSignal.timeout(15000),
  });
  if (!res.ok) throw new Error(`osha HTTP ${res.status}`);
  return await res.text();
}

// Parse inspection list rows. Uses matchAll so we don't rely on
// stateful regex iteration.
// Extracts inspection rows from the OSHA establishment-search HTML.
// Only <tr> rows containing an inspection_detail link are kept. Cell
// text has tags and "&nbsp;" padding removed. Columns are read by fixed
// position (3 = date, 5 = state, 6 = type, 7 = scope, 9 = NAICS, last =
// establishment name); a missing cell yields "".
function parseOshaInspections(html: string): OshaInspection[] {
  const inspections: OshaInspection[] = [];
  const rows = [...html.matchAll(/<tr[^>]*>([\s\S]*?)<\/tr>/gi)];
  for (const rm of rows) {
    const row = rm[1];
    if (!row.includes("inspection_detail")) continue;
    const tds = [...row.matchAll(/<td[^>]*>([\s\S]*?)<\/td>/gi)].map((t) =>
      t[1].replace(/<[^>]+>/g, "").replace(/&nbsp;/g, "").trim(),
    );
    const idMatch = row.match(/establishment\.inspection_detail\?id=([\d.]+)/);
    if (!idMatch) continue;
    const id = idMatch[1];
    inspections.push({
      id,
      date: tds[3] || "",
      state: tds[5] || "",
      type: tds[6] || "",
      scope: tds[7] || "",
      naics: tds[9] || "",
      establishment: tds[tds.length - 1] || "",
      detail_url: `https://www.osha.gov/ords/imis/establishment.inspection_detail?id=${id}`,
    });
  }
  return inspections;
}

// Builds the OSHA brief for `name`.
//   - A name that normalizes to "" short-circuits to "no_match" without
//     touching cache or network.
//   - With useCache (default) a cached "osha" entry is returned as-is.
//   - Otherwise the search page is fetched and parsed; inspections are
//     sorted newest-first (unparseable dates sort last) and the top 5
//     kept. Successful results — including "no_match" — are cached.
//   - Fetch/parse failures return status "error" and are NOT cached.
export async function fetchOshaBrief(name: string, useCache = true): Promise<OshaBrief> {
  const normalized = normalizeEntityName(name);
  const now = new Date().toISOString();
  if (!normalized) {
    return {
      source: "osha",
      fetched_at: now,
      searched_name: name,
      normalized_name: "",
      source_url: "",
      status: "no_match",
      inspection_count: 0,
      most_recent_date: null,
      recent_inspections: [],
      states_seen: [],
    };
  }
  if (useCache) {
    const hit = await cacheGet(normalized, "osha");
    if (hit) return hit;
  }

  // Human-followable link to the same search, reported as source_url.
  const u = `https://www.osha.gov/ords/imis/establishment.search?p_logger=1&establishment=${encodeURIComponent(name)}&State=all&p_case=all`;
  try {
    const html = await fetchOshaHTML(name);
    const inspections = parseOshaInspections(html);
    inspections.sort((a, b) => {
      const pa = Date.parse(a.date) || 0;
      const pb = Date.parse(b.date) || 0;
      return pb - pa;
    });
    const states_seen = [...new Set(inspections.map((i) => i.state).filter(Boolean))];
    const brief: OshaBrief = {
      source: "osha",
      fetched_at: now,
      searched_name: name,
      normalized_name: normalized,
      source_url: u,
      status: inspections.length > 0 ? "ok" : "no_match",
      inspection_count: inspections.length,
      most_recent_date: inspections[0]?.date ?? null,
      recent_inspections: inspections.slice(0, 5),
      states_seen,
    };
    await cacheSet(normalized, "osha", brief);
    return brief;
  } catch (e) {
    return {
      source: "osha",
      fetched_at: now,
      searched_name: name,
      normalized_name: normalized,
      source_url: u,
      status: "error",
      inspection_count: 0,
      most_recent_date: null,
      recent_inspections: [],
      states_seen: [],
      error: (e as Error).message,
    };
  }
}

// ─── ILSOS placeholder ────────────────────────────────────────────

// Returns the Illinois SoS brief for `name`: a cached "ilsos" entry if
// present, otherwise a "source_unreachable" placeholder (not cached).
export async function fetchIlsosBrief(name: string): Promise<IlsosBrief> {
  const normalized = normalizeEntityName(name);
  const cached = await cacheGet(normalized, "ilsos");
  if (cached) return cached;
  const brief: IlsosBrief = {
    source: "ilsos",
    fetched_at: new Date().toISOString(),
    searched_name: name,
    status: "source_unreachable",
    reason: "Illinois SoS apps.ilsos.gov blocks our datacenter ASN. Pending: VPN-routed fetch, OpenCorporates API token (~200/mo free), or paid IL bulk feed.",
  };
  return brief;
}

// ─── SEC EDGAR ticker lookup ──────────────────────────────────────
// The canonical free source. A single JSON file at sec.gov/files/
// company_tickers.json maps every US-listed ticker → CIK → company
// name (~10K issuers). We fetch it at most once per 24h and keep it
// in memory + on disk.
const SEC_TICKERS_URL = "https://www.sec.gov/files/company_tickers.json";
const SEC_TICKERS_CACHE = join(CACHE_DIR, "sec_company_tickers.json");
const SEC_TICKERS_TTL_MS = 24 * 60 * 60 * 1000;

type SecTickerRow = { cik_str: number; ticker: string; title: string };

// In-memory index over the SEC ticker file. `refreshed_at` is the time the
// underlying data was downloaded (not the time it was loaded into memory).
let secTickersIdx: {
  rows: SecTickerRow[];
  byNormalized: Map<string, SecTickerRow[]>;
  refreshed_at: number;
} | null = null;

/**
 * Return the SEC ticker index, refreshing it when older than the TTL.
 *
 * Lookup order: fresh in-memory index → fresh on-disk copy → network.
 * If every refresh path fails, the previous (stale) index is returned, which
 * is `null` when nothing has ever loaded.
 */
async function ensureSecTickerIndex(): Promise<typeof secTickersIdx> {
  const fresh =
    secTickersIdx && Date.now() - secTickersIdx.refreshed_at < SEC_TICKERS_TTL_MS;
  if (fresh) return secTickersIdx;

  let raw: Record<string, SecTickerRow> | null = null;
  let fetchedAt = 0;

  // Disk first
  if (existsSync(SEC_TICKERS_CACHE)) {
    try {
      const st = await readFile(SEC_TICKERS_CACHE, "utf-8");
      const parsed = JSON.parse(st);
      if (
        parsed?._fetched_at &&
        parsed.rows &&
        Date.now() - parsed._fetched_at < SEC_TICKERS_TTL_MS
      ) {
        raw = parsed.rows;
        // Carry the original download time forward so a disk-loaded index
        // does not get a brand-new 24h lease.
        fetchedAt = parsed._fetched_at;
      }
    } catch {
      /* unreadable or corrupt cache file — fall through to the network */
    }
  }

  // Refresh from SEC
  if (!raw) {
    try {
      const res = await fetch(SEC_TICKERS_URL, {
        headers: { "User-Agent": UA, "Accept": "application/json" },
        signal: AbortSignal.timeout(20000),
      });
      if (res.ok) {
        raw = await res.json();
        fetchedAt = Date.now();
      }
    } catch (e) {
      console.warn("[entity] sec tickers fetch failed:", (e as Error).message);
    }

    // Persisting is best-effort and reported separately so a disk problem is
    // not mislabelled as a fetch failure.
    if (raw) {
      try {
        await mkdir(CACHE_DIR, { recursive: true });
        await writeFile(
          SEC_TICKERS_CACHE,
          JSON.stringify({ _fetched_at: fetchedAt, rows: raw }),
        );
      } catch (e) {
        console.warn("[entity] sec tickers cache write failed:", (e as Error).message);
      }
    }
  }

  if (!raw) return secTickersIdx; // may still be null

  const rows = Object.values(raw).filter((r) => r && r.ticker && r.title);
  const byNormalized = new Map<string, SecTickerRow[]>();
  for (const r of rows) {
    const k = normalizeEntityName(r.title);
    if (!k) continue;
    const arr = byNormalized.get(k) || [];
    arr.push(r);
    byNormalized.set(k, arr);
  }
  secTickersIdx = { rows, byNormalized, refreshed_at: fetchedAt };
  return secTickersIdx;
}

// Name → ticker resolution. Tries exact-normalized first, then a
// word-overlap match. Hard rule for the overlap path: at least two shared
// identifying words (4+ chars, not generic stopwords) between the searched
// name and the candidate title. Prevents spurious hits like
// "POREMBA DENISE" matching "NI Holdings" because "DENISE"
// contains "NI".
// Also skips names that look like individual persons
// (two capitalized words, no entity suffix) — they're never issuers.

/**
 * Resolve a permit applicant/contractor name to a row of the SEC ticker index.
 *
 * Order of checks:
 *   1. bail out for likely individuals and government bodies;
 *   2. exact match on the normalized name (shortest ticker wins);
 *   3. fuzzy match on shared identifying words.
 *
 * @returns The best-matching row, or null when there is no index or no match.
 */
async function resolveSecTicker(name: string): Promise<SecTickerRow | null> {
  const idx = await ensureSecTickerIndex();
  if (!idx) return null;
  const key = normalizeEntityName(name);
  if (!key) return null;

  // Skip obvious individual names — "LAST, FIRST M" or "FIRST LAST".
  // Two signals:
  //   (1) comma in name → "LAST, FIRST" form, human.
  //   (2) exactly 2 word-tokens AND no entity marker → "FIRST LAST"
  //       form. A company almost always has ≥3 tokens (X Y Inc) or
  //       an entity marker; two-word entities like "APPLE INC" get
  //       caught by the marker, not by looksIndividual.
  const upper = name.toUpperCase();
  const hasEntityMarker =
    /\b(LLC|L\s*L\s*C|LLP|INC|INC\.|CORP|CORPORATION|COMPANY|CO|CO\.|LTD|LIMITED|GROUP|HOLDINGS|ENTERPRISES|AUTHORITY|ASSOCIATION|TRUST|BANK|PARTNERS|INDUSTRIES|CONSTRUCTION|ELECTRIC|MECHANICAL|CONTRACTING|PROPERTIES|REALTY|DEVELOPMENT|ENGINEERING|PLUMBING|ROOFING|SERVICES|SOLUTIONS|SYSTEMS|TECHNOLOGIES|LOGISTICS|RESOURCES)\b/.test(
      upper,
    );
  const tokens = upper.split(/\s+/).filter(Boolean);
  const hasComma = /,/.test(name);
  const looksIndividual =
    (hasComma && !hasEntityMarker) || (tokens.length === 2 && !hasEntityMarker);
  if (looksIndividual) return null;

  // Skip obvious government / municipal entities — they're not US public
  // issuers. If they were, their bond funds would surface separately.
  const isGovernment =
    /\b(AUTHORITY|MUNICIPAL|CITY OF|COUNTY|DISTRICT|BOARD OF EDUCATION|PUBLIC SCHOOLS|TRANSIT|HOUSING AUTHORITY|DEPARTMENT OF|BUREAU OF|PARK DISTRICT|WATER RECLAMATION)\b/i.test(
      name,
    );
  if (isGovernment) return null;

  // Exact normalized match — most reliable path. Sort a copy: the array
  // belongs to the shared index and must not be reordered in place.
  const exact = idx.byNormalized.get(key);
  if (exact && exact.length) {
    return [...exact].sort((a, b) => a.ticker.length - b.ticker.length)[0];
  }

  // Word-overlap with stopwords filtered. "CONSTRUCTION", "COMPANY",
  // "SERVICES" etc. are too generic to identify an issuer on their own
  // — shared presence is meaningless. Require overlap on identifying
  // words only.
  const STOPWORDS = new Set([
    "CONSTRUCTION","COMPANY","SERVICES","SERVICE","GROUP","SYSTEMS",
    "SOLUTIONS","RESOURCES","TECHNOLOGIES","INDUSTRIES","PARTNERS",
    "ELECTRIC","MECHANICAL","PLUMBING","ROOFING","DEVELOPMENT","REALTY",
    "PROPERTIES","ENGINEERING","ASSOCIATES","INTERNATIONAL","NATIONAL",
    "AMERICAN","GENERAL","AUTHORITY","HOLDINGS","ENTERPRISES","LOGISTICS",
    "COMMUNICATIONS","FINANCIAL","CAPITAL","PROPERTY","RESIDENTIAL",
    "COMMERCIAL","GLOBAL","UNITED","STATES","STATE","FEDERAL",
    "MANAGEMENT","CONTRACTING","CONTRACTORS","BUILDERS","BUILDING",
  ]);
  const identifyingWords = (s: string) =>
    new Set(s.split(" ").filter((w) => w.length >= 4 && !STOPWORDS.has(w)));

  const keyId = identifyingWords(key);
  if (keyId.size === 0) return null; // no non-stopword words to match on

  const candidates: { row: SecTickerRow; overlap: number; k2: string }[] = [];
  for (const r of idx.rows) {
    const k2 = normalizeEntityName(r.title);
    if (!k2) continue;
    const k2Id = identifyingWords(k2);
    let overlap = 0;
    for (const w of k2Id) {
      if (keyId.has(w)) overlap++;
    }
    // Fuzzy path requires ≥ 2 shared identifying words; single-word matches
    // are only valid via the exact-normalized path above. Otherwise "POWER"
    // (from POWER CONSTRUCTION) falsely matches POWER INTEGRATIONS and
    // CAPITAL POWER CORP, and "TARGET HOSPITALITY" matches "TARGET CORP".
    if (overlap < 2) continue;
    // ...and the shared words must cover at least half of the smaller
    // identifying-word set, so the two names agree on substance.
    const minIdSize = Math.min(keyId.size, k2Id.size);
    if (overlap / minIdSize < 0.5) continue;
    candidates.push({ row: r, overlap, k2 });
  }
  if (!candidates.length) return null;

  // Rank: most shared words, then closest normalized length, then the
  // shortest ticker.
  candidates.sort((a, b) => {
    if (b.overlap !== a.overlap) return b.overlap - a.overlap;
    const la = Math.abs(a.k2.length - key.length);
    const lb = Math.abs(b.k2.length - key.length);
    if (la !== lb) return la - lb;
    return a.row.ticker.length - b.row.ticker.length;
  });
  return candidates[0].row;
}

/**
 * Pull SIC + exchange for a CIK from data.sec.gov/submissions.
 * Best-effort: any HTTP or network failure yields an empty object.
 */
async function fetchSecProfile(cik: number | string): Promise<{
  exchange?: string;
  sic?: string;
  sic_description?: string;
}> {
  // The submissions endpoint expects the CIK zero-padded to 10 digits.
  const padded = String(cik).padStart(10, "0");
  try {
    const res = await fetch(`https://data.sec.gov/submissions/CIK${padded}.json`, {
      headers: { "User-Agent": UA },
      signal: AbortSignal.timeout(10000),
    });
    if (!res.ok) return {};
    const j = await res.json();
    return {
      exchange: Array.isArray(j.exchanges) ? j.exchanges[0] : undefined,
      sic: j.sic,
      sic_description: j.sicDescription,
    };
  } catch {
    return {};
  }
}

// Stooq — free daily quote CSV.
// Format (column order from the response):
//   Symbol,Date,Time,Open,High,Low,Close,Volume

/**
 * Fetch the latest daily quote for a US ticker from Stooq.
 * @returns Parsed quote fields, or null for unknown symbols and any failure.
 */
export async function fetchStooqQuote(ticker: string): Promise<{
  price?: number;
  price_date?: string;
  open?: number;
  high?: number;
  low?: number;
  volume?: number;
} | null> {
  try {
    const res = await fetch(
      `https://stooq.com/q/l/?s=${encodeURIComponent(ticker.toLowerCase())}.us&f=sd2t2ohlcv&h&e=csv`,
      { headers: { "User-Agent": UA }, signal: AbortSignal.timeout(8000) },
    );
    if (!res.ok) return null;
    const csv = await res.text();
    const lines = csv.trim().split("\n");
    if (lines.length < 2) return null; // header only
    const cols = lines[1].split(",");
    // Stooq returns "N/D" literally for unknown symbols
    if (cols.includes("N/D")) return null;
    // Symbol and Time columns are not used.
    const [, date, , open, high, low, close, volume] = cols;
    return {
      price: parseFloat(close) || undefined,
      price_date: date,
      open: parseFloat(open) || undefined,
      high: parseFloat(high) || undefined,
      low: parseFloat(low) || undefined,
      volume: parseInt(volume, 10) || undefined,
    };
  } catch {
    return null;
  }
}

// Cheap name-to-ticker lookup for batch use (e.g. profiler index over
// 200 contractors). Hits the in-memory SEC tickers index and the
// KNOWN_PARENT_MAP. No SEC profile fetch, no Stooq fetch. Returns
// every ticker we can attribute to a name, with a tag describing the
// link (direct = name matches a public issuer, parent = the company is
// a subsidiary with a public parent in our curated map).
export type LiteTicker = {
  ticker: string;
  via: "direct" | "parent" | "exact";
  matched_name?: string;
  exchange?: string;
  reason?: string;
};

/**
 * Collect every ticker attributable to `name` without any per-name network
 * quote/profile calls. May return an empty array.
 */
export async function lookupTickerLite(name: string): Promise<LiteTicker[]> {
  const out: LiteTicker[] = [];
  const key = normalizeEntityName(name);

  // Direct SEC match (in-memory)
  const direct = await resolveSecTicker(name);
  if (direct) {
    out.push({
      ticker: direct.ticker,
      via: normalizeEntityName(direct.title) === key ? "exact" : "direct",
      matched_name: direct.title,
    });
  }

  // Parent map (curated)
  if (key) {
    for (const [k, v] of Object.entries(KNOWN_PARENT_MAP)) {
      const parentKey = normalizeEntityName(k);
      // An empty needle would make `includes` true for every name.
      if (!parentKey || !key.includes(parentKey)) continue;
      if (v.status !== "ok" || !v.parent_ticker) continue;
      out.push({
        ticker: v.parent_ticker,
        via: "parent",
        matched_name: v.parent_name,
        exchange: v.parent_exchange,
        reason: v.link_source,
      });
    }
  }
  return out;
}

/**
 * Full ticker brief: SEC name→ticker match plus SEC profile and Stooq quote.
 * Cached per normalized name (7 days for no_match, 6 hours for hits).
 */
export async function fetchTickerBrief(name: string): Promise<TickerBrief> {
  const now = new Date().toISOString();
  const cacheKey = normalizeEntityName(name);
  const cached = await cacheGet(cacheKey, "ticker");
  if (cached) return cached;

  const hit = await resolveSecTicker(name);
  if (!hit) {
    const b: TickerBrief = {
      source: "sec+stooq",
      fetched_at: now,
      searched_name: name,
      status: "no_match",
    };
    // 7-day TTL on no_match — company could become public
    await cacheSet(cacheKey, "ticker", b, 7 * 24 * 60 * 60 * 1000);
    return b;
  }

  const [profile, quote] = await Promise.all([
    fetchSecProfile(hit.cik_str),
    fetchStooqQuote(hit.ticker),
  ]);

  // price × volume, i.e. the day's dollar volume — a rough size proxy only,
  // not a market capitalization.
  const cap_proxy = quote?.price && quote?.volume ? quote.price * quote.volume : undefined;
  // Intraday move relative to the open, in percent.
  const day_change_pct =
    quote?.price && quote?.open && quote.open > 0
      ? ((quote.price - quote.open) / quote.open) * 100
      : undefined;

  const brief: TickerBrief = {
    source: "sec+stooq",
    fetched_at: now,
    searched_name: name,
    status: "ok",
    ticker: hit.ticker,
    company_name: hit.title,
    cik: String(hit.cik_str),
    exchange: profile.exchange,
    sic: profile.sic,
    sic_description: profile.sic_description,
    price: quote?.price,
    price_date: quote?.price_date,
    open: quote?.open,
    high: quote?.high,
    low: quote?.low,
    volume: quote?.volume,
    day_change_pct,
    cap_proxy,
    sec_url: `https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=${String(hit.cik_str).padStart(10, "0")}`,
    stooq_url: `https://stooq.com/q/?s=${hit.ticker.toLowerCase()}.us`,
  };
  // 6-hour TTL on ok hits — prices go stale, but SEC metadata doesn't.
  await cacheSet(cacheKey, "ticker", brief, 6 * 60 * 60 * 1000);
  return brief;
}

// ─── Contractor history ────────────────────────────────────────────
// Queries the same Chicago Socrata permits dataset for prior activity.
// Two time windows: last 180 days (recent velocity) and last 24 months
// (longer-run baseline). All-time count gives tenure. Trend classifier
// compares 180d rate to 24mo rate.

// GET a Socrata URL and parse JSON, failing loudly on non-2xx so an error
// payload is never mistaken for "zero rows".
async function socrataJson(url: string): Promise<any> {
  const res = await fetch(url, {
    headers: { "User-Agent": UA },
    signal: AbortSignal.timeout(15000),
  });
  if (!res.ok) throw new Error(`socrata HTTP ${res.status}`);
  return res.json();
}

// Classify permit velocity: the last 180 days versus the average 180-day
// rate over the remainder of the 730-day baseline window.
function classifyPermitTrend(
  n180: number,
  n24: number,
  nAll: number,
): ContractorHistory["trend"] {
  if (nAll === 0) return "unknown";
  if (n24 <= 1 && nAll <= 3) return "new";
  // The baseline minus the recent window is 550 days ≈ 3.06 periods of 180d.
  const priorPeriods = (730 - 180) / 180;
  const priorRate = (n24 - n180) / priorPeriods;
  if (priorRate === 0 && n180 > 0) return "growing";
  if (priorRate > 0) {
    const ratio = n180 / priorRate;
    if (ratio > 1.5) return "growing";
    if (ratio < 0.5) return "declining";
    return "stable";
  }
  return "stable";
}

/**
 * Permit history for a contractor name from the Chicago permits dataset.
 * Never throws: any query failure returns a `status: "error"` brief, which is
 * not cached. Successful results are cached for 12 hours.
 */
export async function fetchContractorHistory(name: string): Promise<ContractorHistory> {
  const now = new Date().toISOString();
  const cacheKey = normalizeEntityName(name);
  const cached = await cacheGet(cacheKey, "history");
  if (cached) return cached;

  const today = new Date();
  const d180 = new Date(today.getTime() - 180 * 86400000).toISOString().slice(0, 10);
  const d24mo = new Date(today.getTime() - 730 * 86400000).toISOString().slice(0, 10);

  const base = "https://data.cityofchicago.org/resource/ydr8-5enu.json";
  // Escape single quotes in contractor name per Socrata rules.
  // Socrata LIKE with leading % prefix is substring — we need this
  // because "TURNER CONSTRUCTION" on a new permit should match against
  // "TURNER CONSTRUCTION COMPANY" on historical ones. Guard against
  // false-positive hits on very short names (< 6 chars).
  const safeName = name.replace(/'/g, "''");
  const match =
    name.length >= 6
      ? `upper(contact_1_name) LIKE upper('%${safeName}%')`
      : `contact_1_name='${safeName}'`;
  const where180 = encodeURIComponent(`${match} AND issue_date>'${d180}'`);
  const where24mo = encodeURIComponent(`${match} AND issue_date>'${d24mo}'`);
  const whereAll = encodeURIComponent(match);

  try {
    const [c180, c24, cAll, recent] = await Promise.all([
      socrataJson(`${base}?$select=count(*)&$where=${where180}`),
      socrataJson(`${base}?$select=count(*),sum(reported_cost)&$where=${where24mo}`),
      socrataJson(`${base}?$select=count(*)&$where=${whereAll}`),
      // Pulled to 50 rows + lat/lng + permit id + description so the
      // contractor profile heat map and timeline have something to plot.
      socrataJson(
        `${base}?$select=id,issue_date,work_type,reported_cost,street_number,street_direction,street_name,latitude,longitude,work_description&$where=${whereAll}&$order=issue_date DESC&$limit=50`,
      ),
    ]);

    const n180 = parseInt(c180?.[0]?.count || "0", 10);
    const n24 = parseInt(c24?.[0]?.count || "0", 10);
    const cost24 = parseFloat(c24?.[0]?.sum_reported_cost || "0");
    const nAll = parseInt(cAll?.[0]?.count || "0", 10);

    const brief: ContractorHistory = {
      source: "chicago_socrata",
      fetched_at: now,
      searched_name: name,
      status: "ok",
      permits_last_180d: n180,
      permits_last_24mo: n24,
      permits_historical_total: nAll,
      total_cost_last_24mo: cost24,
      trend: classifyPermitTrend(n180, n24, nAll),
      recent_permits: (recent || []).map((r: any) => ({
        date: (r.issue_date || "").slice(0, 10),
        work_type: r.work_type || "",
        cost: parseFloat(r.reported_cost || 0),
        address: `${r.street_number || ""} ${r.street_direction || ""} ${r.street_name || ""}`
          .trim()
          .replace(/\s+/g, " "),
        lat: r.latitude ? parseFloat(r.latitude) : undefined,
        lng: r.longitude ? parseFloat(r.longitude) : undefined,
        permit_id: r.id || undefined,
        description: r.work_description || undefined,
      })),
    };
    // 12-hour TTL — activity rarely changes more than once/day
    await cacheSet(cacheKey, "history", brief, 12 * 60 * 60 * 1000);
    return brief;
  } catch (e) {
    return {
      source: "chicago_socrata",
      fetched_at: now,
      searched_name: name,
      status: "error",
      permits_last_180d: 0,
      permits_last_24mo: 0,
      permits_historical_total: 0,
      total_cost_last_24mo: 0,
      trend: "unknown",
      recent_permits: [],
      error: (e as Error).message,
    };
  }
}

// ─── Cook County Assessor (property owner) ─────────────────────────
// Free Socrata at datacatalog.cookcountyil.gov/resource/c49d-89sn.json.
// Address normalization is gentle: keep number + direction + name; drop
// suite/unit. Match using LIKE prefix because addresses on building
// permits are often missing trailing details ("4809 N BROADWAY" vs
// "4809 N BROADWAY UNIT 5" in the parcel record).
/**
 * Look up the Cook County Assessor parcel record for a street address.
 * Never throws: failures return `status: "error"` (not cached). Hits are
 * cached with the default TTL, misses for 7 days.
 */
export async function fetchPropertyOwner(address: string): Promise<PropertyOwnerBrief> {
  const now = new Date().toISOString();
  const cleaned = address.trim().replace(/\s+/g, " ").toUpperCase();

  // Too short to be a usable address — nothing is ever cached under such a
  // key, so skip the cache read as well.
  if (!cleaned || cleaned.length < 4) {
    return {
      source: "cook_county_assessor",
      fetched_at: now,
      searched_address: address,
      status: "no_match",
    };
  }

  const cached = await cacheGet(cleaned, "owner");
  if (cached) return cached;

  const safe = cleaned.replace(/'/g, "''");
  // Match the full address text as a prefix; LIKE upper(...) for case fold.
  const where = encodeURIComponent(
    `upper(property_address) like upper('${safe}%') AND property_city='CHICAGO'`,
  );
  const sourceUrl = `https://datacatalog.cookcountyil.gov/resource/c49d-89sn.json?$where=${where}&$limit=1`;

  try {
    const res = await fetch(sourceUrl, {
      headers: { "User-Agent": UA },
      signal: AbortSignal.timeout(10000),
    });
    if (!res.ok) throw new Error(`assessor HTTP ${res.status}`);
    const rows = (await res.json()) as any[];

    if (!rows || rows.length === 0) {
      const empty: PropertyOwnerBrief = {
        source: "cook_county_assessor",
        fetched_at: now,
        searched_address: address,
        status: "no_match",
        source_url: sourceUrl,
      };
      await cacheSet(cleaned, "owner", empty, 7 * 24 * 60 * 60 * 1000);
      return empty;
    }

    const r = rows[0];
    const brief: PropertyOwnerBrief = {
      source: "cook_county_assessor",
      fetched_at: now,
      searched_address: address,
      status: "ok",
      pin: r.pin,
      property_address: r.property_address,
      property_zip: r.property_zip,
      mailing_address: r.mailing_address,
      mailing_city: r.mailing_city,
      mailing_state: r.mailing_state,
      mailing_zip: r.mailing_zip,
      township: r.township_name,
      ward: r.ward,
      longitude: r.longitude,
      latitude: r.latitude,
      source_url: sourceUrl,
    };
    // Owner records change rarely — 30d cache.
    await cacheSet(cleaned, "owner", brief);
    return brief;
  } catch (e) {
    return {
      source: "cook_county_assessor",
      fetched_at: now,
      searched_address: address,
      status: "error",
      // Keep the query link so a failed lookup can be retried by hand.
      source_url: sourceUrl,
      error: (e as Error).message,
    };
  }
}

// ─── Chicago Building Violations ───────────────────────────────────
// Dataset 22u3-xenr. Address-keyed. Status field tells us which are
// open vs complied. Stop-work orders surface in violation_description.

/**
 * Summarize building violations recorded against an address.
 * Counts are computed over the 50 most recent rows only ($limit=50), so
 * `total_violations` saturates at 50. Never throws; errors are not cached.
 */
export async function fetchPropertyViolations(address: string): Promise<PropertyViolationsBrief> {
  const now = new Date().toISOString();
  const cleaned = address.trim().replace(/\s+/g, " ").toUpperCase();

  if (!cleaned || cleaned.length < 6) {
    return {
      source: "chicago_building_violations",
      fetched_at: now,
      searched_address: address,
      status: "error",
      total_violations: 0,
      open_violations: 0,
      stop_work_orders: 0,
      recent_violations: [],
      error: "address too short",
    };
  }

  const cached = await cacheGet(cleaned, "violations");
  if (cached) return cached;

  const safe = cleaned.replace(/'/g, "''");
  const where = encodeURIComponent(`upper(address) like upper('${safe}%')`);
  const url = `https://data.cityofchicago.org/resource/22u3-xenr.json?$where=${where}&$order=violation_date DESC&$limit=50`;

  try {
    const res = await fetch(url, {
      headers: { "User-Agent": UA },
      signal: AbortSignal.timeout(10000),
    });
    if (!res.ok) throw new Error(`violations HTTP ${res.status}`);
    const rows = (await res.json()) as any[];

    const total = rows.length;
    const open = rows.filter((r) =>
      /OPEN|FAILED|NO ENTRY/i.test(r.violation_status || ""),
    ).length;
    // Stop-work orders have no dedicated column; look for the phrase in the
    // description and the inspector comments.
    const stopWork = rows.filter((r) =>
      /STOP[ -]?WORK/i.test(
        `${r.violation_description || ""} ${r.violation_inspector_comments || ""}`,
      ),
    ).length;
    const recent = rows.slice(0, 5).map((r) => ({
      date: (r.violation_date || "").slice(0, 10),
      status: r.violation_status || "",
      description: (r.violation_description || "").slice(0, 120),
      department: r.department_bureau || "",
    }));

    const brief: PropertyViolationsBrief = {
      source: "chicago_building_violations",
      fetched_at: now,
      searched_address: address,
      status: "ok",
      total_violations: total,
      open_violations: open,
      stop_work_orders: stopWork,
      most_recent_date: rows[0]?.violation_date?.slice(0, 10),
      recent_violations: recent,
    };
    await cacheSet(cleaned, "violations", brief, 12 * 60 * 60 * 1000);
    return brief;
  } catch (e) {
    return {
      source: "chicago_building_violations",
      fetched_at: now,
      searched_address: address,
      status: "error",
      total_violations: 0,
      open_violations: 0,
      stop_work_orders: 0,
      recent_violations: [],
      error: (e as Error).message,
    };
  }
}

// ─── Union lookup (static) ─────────────────────────────────────────
// Map permit work_type → likely Chicago Local(s) + their training
// centers (JATCs). Surfaces "every union that handles this contract"
// per J's request. Static because Chicago union jurisdictions are
// stable and the granularity is "typical assignment, not certainty
// per project."
const UNION_MAP: Record<string, UnionMapping> = {
  ELECTRICAL: {
    trade: "Electrical",
    primary_locals: [
      { name: "IBEW Local 134", local: "134", jurisdiction: "Inside electrical (Chicago)" },
      { name: "IBEW Local 701", local: "701", jurisdiction: "Low-voltage / suburban" },
    ],
    training_centers: [
      { name: "IBEW 134 / NECA JATC", address: "6301 W 115th St, Alsip IL", program_length: "5-yr apprenticeship" },
      { name: "IBEW 701 / NECA JATC", address: "1979 Bucktail Ln, Sugar Grove IL", program_length: "5-yr apprenticeship" },
    ],
  },
  PLUMBING: {
    trade: "Plumbing",
    primary_locals: [
      { name: "Plumbers Local 130", local: "130", jurisdiction: "Chicago + Cook County" },
    ],
    training_centers: [
      { name: "Plumbers 130 JAC", address: "1340 W Washington Blvd, Chicago", program_length: "5-yr apprenticeship" },
    ],
  },
  MECHANICAL: {
    trade: "Mechanical / HVAC / Pipe",
    primary_locals: [
      { name: "Pipe Fitters Local 597", local: "597", jurisdiction: "HVAC + process piping" },
      { name: "Sheet Metal Workers Local 73", local: "73", jurisdiction: "Sheet metal / ductwork" },
      { name: "IUOE Local 399", local: "399", jurisdiction: "Stationary engineers (boilers, chillers)" },
    ],
    training_centers: [
      { name: "Local 597 Training Center", address: "10806 W 47th St, McCook IL", program_length: "5-yr" },
      { name: "Sheet Metal 73 JAC", address: "16410 S 84th Ave, Tinley Park IL", program_length: "5-yr" },
    ],
  },
  REROOFING: {
    trade: "Roofing",
    primary_locals: [
      { name: "Roofers Local 11", local: "11", jurisdiction: "Chicago + suburbs" },
    ],
    training_centers: [
      { name: "Roofers Local 11 Training", address: "9525 S Industrial Dr, Bridgeview IL", program_length: "3-yr" },
    ],
  },
  MASONRY: {
    trade: "Masonry / Concrete",
    primary_locals: [
      { name: "Bricklayers Local 21", local: "21", jurisdiction: "Brick + stone" },
      { name: "Cement Masons Local 502", local: "502", jurisdiction: "Concrete finishing" },
    ],
    training_centers: [
      { name: "Bricklayers 21 Training", address: "11244 S Cottage Grove, Chicago", program_length: "3-yr" },
      { name: "Cement Masons 502 JATC", address: "739 25th Ave, Bellwood IL", program_length: "3-yr" },
    ],
  },
  GENERAL: {
    trade: "General construction (multi-trade)",
    primary_locals: [
      { name: "Chicago Regional Council of Carpenters", local: "1/13/58/1051", jurisdiction: "Carpentry + drywall" },
      { name: "LIUNA Locals 1/2/4/6/76/96/149/152/225/269/296/1001", local: "various", jurisdiction: "Laborers / earthwork" },
      { name: "Operating Engineers Local 150", local: "150", jurisdiction: "Heavy equipment (cranes, excavators)" },
      { name: "Ironworkers Local 1/111/136/395", local: "1/111/136/395", jurisdiction: "Structural / rebar" },
      { name: "Teamsters Local 731/705", local: "731/705", jurisdiction: "Material haul" },
    ],
    training_centers: [
      { name: "Chicago Regional Carpenters Training Center", address: "11001 W Roosevelt, Westchester IL", program_length: "4-yr" },
      { name: "Laborers Training School", address: "3636 W Pershing Rd, Chicago", program_length: "3-yr" },
      { name: "Operating Engineers 150 Training", address: "16125 W 100th St, Wilmington IL", program_length: "3-yr" },
      { name: "Iron Workers Local 1 Apprenticeship", address: "5775 W 26th St, Cicero IL", program_length: "3-yr" },
    ],
  },
  WRECKING: {
    trade: "Wrecking / Demolition",
    primary_locals: [
      { name: "LIUNA Locals (laborers)", local: "1/2/4/...", jurisdiction: "Manual demolition" },
      { name: "Operating Engineers Local 150", local: "150", jurisdiction: "Heavy equipment demo" },
      { name: "Teamsters Local 731", local: "731", jurisdiction: "Debris haul" },
    ],
    training_centers: [
      { name: "Laborers Training School (HazMat track)", address: "3636 W Pershing Rd, Chicago", program_length: "3-yr" },
    ],
  },
  SIGN: {
    trade: "Signs / Signage",
    primary_locals: [
      { name: "IBEW Local 134 (electric signs)", local: "134", jurisdiction: "Lit signage" },
      { name: "Painters DC 14 Local 30 (sign painters)", local: "30", jurisdiction: "Hand-painted / vinyl" },
    ],
    training_centers: [
      { name: "Painters DC 14 Training", address: "1456 S La Salle, Chicago", program_length: "3-yr" },
    ],
  },
};

/**
 * Map a permit work_type string to the unions that typically staff it.
 * Keyword checks run in a fixed order and the first hit wins (so
 * "ELECTRIC SIGN" resolves to Electrical); anything unrecognized falls back
 * to the general multi-trade entry.
 *
 * @returns The mapping, or null when no work type was supplied.
 */
export function unionsForWorkType(workType: string | undefined): UnionMapping | null {
  if (!workType) return null;
  const w = workType.toUpperCase();
  if (/ELECTRIC/.test(w)) return UNION_MAP.ELECTRICAL;
  if (/PLUMB/.test(w)) return UNION_MAP.PLUMBING;
  if (/MECHANIC|HVAC|REFRIGERAT|VENT|HEAT/.test(w)) return UNION_MAP.MECHANICAL;
  if (/ROOF/.test(w)) return UNION_MAP.REROOFING;
  if (/MASON|CONCRETE|BRICK|CEMENT/.test(w)) return UNION_MAP.MASONRY;
  if (/WRECK|DEMO/.test(w)) return UNION_MAP.WRECKING;
  if (/SIGN/.test(w)) return UNION_MAP.SIGN;
  return UNION_MAP.GENERAL;
}

// ─── USASpending.gov federal contracts ─────────────────────────────
// api.usaspending.gov/api/v2/search/spending_by_award/ — POST with
// JSON body. Returns prime award records by recipient name.
// Free, no auth. Slow on first call (~3-5s) but cacheable 24h.
/**
 * Federal prime contract awards (2020-01-01 → today) for a recipient name.
 * Totals are computed over the top 25 awards by amount (the page size), so
 * `total_awards_count` saturates at 25. Never throws; errors are not cached.
 */
export async function fetchFederalContracts(name: string): Promise<FederalContractsBrief> {
  const now = new Date().toISOString();
  const cacheKey = normalizeEntityName(name);
  const cached = await cacheGet(cacheKey, "federal");
  if (cached) return cached;

  // Very short names would match far too many recipients.
  if (!name || name.length < 5) {
    return {
      source: "usaspending.gov",
      fetched_at: now,
      searched_name: name,
      status: "no_match",
      total_awards_count: 0,
      total_awards_value: 0,
      top_agencies: [],
    };
  }

  const url = "https://api.usaspending.gov/api/v2/search/spending_by_award/";
  const searchUrl = `https://www.usaspending.gov/search/?keywords=${encodeURIComponent(name)}`;
  const body = {
    filters: {
      award_type_codes: ["A", "B", "C", "D"], // contract types
      recipient_search_text: [name],
      time_period: [
        { start_date: "2020-01-01", end_date: new Date().toISOString().slice(0, 10) },
      ],
    },
    fields: [
      "Award ID",
      "Recipient Name",
      "Award Amount",
      "Awarding Agency",
      "Action Date",
      "NAICS",
    ],
    sort: "Award Amount",
    order: "desc",
    limit: 25,
    page: 1,
  };

  try {
    const res = await fetch(url, {
      method: "POST",
      headers: { "Content-Type": "application/json", "User-Agent": UA },
      body: JSON.stringify(body),
      signal: AbortSignal.timeout(15000),
    });
    if (!res.ok) throw new Error(`usaspending HTTP ${res.status}`);
    const j = (await res.json()) as any;
    const results = j.results || [];

    if (results.length === 0) {
      const empty: FederalContractsBrief = {
        source: "usaspending.gov",
        fetched_at: now,
        searched_name: name,
        status: "no_match",
        total_awards_count: 0,
        total_awards_value: 0,
        top_agencies: [],
        source_url: searchUrl,
      };
      await cacheSet(cacheKey, "federal", empty, 7 * 24 * 60 * 60 * 1000);
      return empty;
    }

    // Sum award amounts overall and per awarding agency in a single pass.
    let totalValue = 0;
    const agencyMap: Record<string, number> = {};
    for (const r of results) {
      const amount = parseFloat(r["Award Amount"]) || 0;
      const a = r["Awarding Agency"] || "Unknown";
      totalValue += amount;
      agencyMap[a] = (agencyMap[a] || 0) + amount;
    }
    const top = Object.entries(agencyMap)
      .map(([agency, value]) => ({ agency, value }))
      .sort((a, b) => b.value - a.value)
      .slice(0, 3);

    // Lexicographic sort puts the latest date last, assuming the API's
    // "Action Date" strings are ISO-style (YYYY-MM-DD) — TODO confirm.
    const recent = results
      .map((r: any) => r["Action Date"])
      .filter(Boolean)
      .sort()
      .reverse()[0];

    const brief: FederalContractsBrief = {
      source: "usaspending.gov",
      fetched_at: now,
      searched_name: name,
      status: "ok",
      total_awards_count: results.length,
      total_awards_value: totalValue,
      most_recent_award_date: recent,
      top_agencies: top,
      source_url: searchUrl,
    };
    await cacheSet(cacheKey, "federal", brief, 24 * 60 * 60 * 1000);
    return brief;
  } catch (e) {
    return {
      source: "usaspending.gov",
      fetched_at: now,
      searched_name: name,
      status: "error",
      total_awards_count: 0,
      total_awards_value: 0,
      top_agencies: [],
      error: (e as Error).message,
    };
  }
}

// ─── NLRB cases (placeholder) ──────────────────────────────────────
// nlrb.gov/search/case has JSON results behind their JS app. The path
// requires session cookies + form tokens — needs a short scraper.
// Returning placeholder until next batch.
// NOTE(review): the return type's generic argument was missing from this
// copy of the file; `NlrbBrief` follows the naming of the sibling briefs —
// confirm against the type declarations.
export async function fetchNlrbBrief(name: string): Promise<NlrbBrief> {
  return {
    source: "nlrb.gov",
    fetched_at: new Date().toISOString(),
    searched_name: name,
    status: "needs_setup",
    reason:
      "NLRB case search at apps.nlrb.gov/case requires session-aware scraping; queued for batch 5b.",
  };
}

// ─── Risk scoring ──────────────────────────────────────────────────

/**
 * Fold the available briefs into a coarse 0–100 risk score.
 *
 * @returns `score` (null when no signal produced one), `factors` (machine-
 *   readable reasons, in evaluation order) and `partial` (true when any
 *   source was missing, errored, or — for ILSOS — not "ok").
 */
function scoreEntity(
  osha: OshaBrief | null,
  ilsos: IlsosBrief | null,
  history: ContractorHistory | null,
) {
  const factors: string[] = [];
  let score: number | null = null;
  let partial = false;

  if (!osha) {
    partial = true;
  } else if (osha.status === "ok") {
    // Base score from inspection volume.
    const n = osha.inspection_count;
    if (n === 0) {
      score = 10;
      factors.push("osha_clean:no_inspections");
    } else if (n < 3) {
      score = 25;
      factors.push(`osha_low:${n}_inspections`);
    } else if (n < 10) {
      score = 50;
      factors.push(`osha_moderate:${n}_inspections`);
    } else {
      score = 75;
      factors.push(`osha_high:${n}_inspections`);
    }
    // +15 (capped at 100) when the latest inspection is under 180 days old.
    // An unparseable date is treated as epoch 0, i.e. never recent.
    if (osha.most_recent_date) {
      const ageDays =
        (Date.now() - (Date.parse(osha.most_recent_date) || 0)) / (1000 * 60 * 60 * 24);
      if (ageDays < 180) {
        factors.push("osha_recent_activity:<180d");
        score = Math.min(100, (score ?? 0) + 15);
      }
    }
  } else if (osha.status === "no_match") {
    factors.push("osha_no_match");
  } else {
    factors.push("osha_error:" + (osha.error || "unknown"));
    partial = true;
  }

  if (!ilsos || ilsos.status !== "ok") {
    partial = true;
    factors.push("ilsos_pending");
  }

  // Brand-new LLCs with a single permit and zero tenure are a
  // classic LLC-shuffle signature. Flag independently of OSHA.
  if (history?.status === "ok") {
    if (history.trend === "new") {
      factors.push("new_entity:<=3_permits_ever");
      score = Math.max(score ?? 0, 35);
    } else if (history.trend === "growing") {
      factors.push(`activity_growing:${history.permits_last_180d}_in_180d`);
    } else if (history.trend === "declining") {
      factors.push(`activity_declining:${history.permits_last_180d}_in_180d`);
    }
  }

  return { score, factors, partial };
}

// ─── CTA L stations (static, in-memory) ────────────────────────────
// Pulled once from Chicago Socrata 8pix-ypme. ~150 stations, rarely
// changes. Compute great-circle distance to nearest with haversine.
type CtaStation = {
  station_name: string;
  lat: number;
  lon: number;
  lines: string[];
};

let _ctaStations: CtaStation[] | null = null;
// Earliest time (epoch ms) at which a failed station load may be retried.
let _ctaRetryAt = 0;

/**
 * Load (once) the CTA 'L' station list.
 *
 * A successful load is memoized for the life of the process. A failed load
 * returns an empty list and is retried after a short cool-down rather than
 * being cached forever.
 */
async function ensureCtaLoaded(): Promise<CtaStation[]> {
  if (_ctaStations) return _ctaStations;
  if (Date.now() < _ctaRetryAt) return [];

  // Dataset column → display name, in the order lines are reported.
  const lineFlags: Array<[string, string]> = [
    ["red", "Red"],
    ["blue", "Blue"],
    ["g", "Green"],
    ["brn", "Brown"],
    ["p", "Purple"],
    ["y", "Yellow"],
    ["pnk", "Pink"],
    ["o", "Orange"],
  ];

  try {
    const res = await fetch(
      "https://data.cityofchicago.org/resource/8pix-ypme.json?$limit=400",
      { headers: { "User-Agent": UA }, signal: AbortSignal.timeout(15000) },
    );
    if (!res.ok) throw new Error(`cta HTTP ${res.status}`);
    const rows = (await res.json()) as any[];

    // The dataset has one row per stop. De-dupe by station name, keeping the
    // first stop's coordinates but merging line flags across all of a
    // station's stops.
    const byStation = new Map<string, { lat: number; lon: number; served: Set<string> }>();
    for (const r of rows) {
      const loc = r.location || {};
      const lat = parseFloat(loc.latitude || "");
      const lon = parseFloat(loc.longitude || "");
      if (Number.isNaN(lat) || Number.isNaN(lon)) continue;
      const station = r.station_name || r.stop_name;
      if (!station) continue;
      let entry = byStation.get(station);
      if (!entry) {
        entry = { lat, lon, served: new Set<string>() };
        byStation.set(station, entry);
      }
      for (const [flag, label] of lineFlags) {
        if (r[flag]) entry.served.add(label);
      }
    }

    const stations: CtaStation[] = [...byStation].map(([station_name, e]) => ({
      station_name,
      lat: e.lat,
      lon: e.lon,
      lines: lineFlags.map(([, label]) => label).filter((label) => e.served.has(label)),
    }));
    _ctaStations = stations;
    return stations;
  } catch (e) {
    console.warn("[cta] load failed:", (e as Error).message);
    _ctaRetryAt = Date.now() + 5 * 60 * 1000;
    return [];
  }
}

/** Great-circle distance in meters between two lat/lon points (degrees). */
function haversineMeters(lat1: number, lon1: number, lat2: number, lon2: number): number {
  const R = 6371000; // mean Earth radius, meters
  const toRad = (d: number) => (d * Math.PI) / 180;
  const dLat = toRad(lat2 - lat1);
  const dLon = toRad(lon2 - lon1);
  const a =
    Math.sin(dLat / 2) ** 2 +
    Math.cos(toRad(lat1)) * Math.cos(toRad(lat2)) * Math.sin(dLon / 2) ** 2;
  // Clamp: rounding can push `a` a hair above 1 for near-antipodal points,
  // which would make asin return NaN.
  return 2 * R * Math.asin(Math.sqrt(Math.min(1, a)));
}

/** Nearest loaded CTA station and its distance in meters, or null if none. */
async function nearestCtaStation(lat: number, lon: number) {
  const stations = await ensureCtaLoaded();
  if (stations.length === 0) return null;
  let best: { station: CtaStation; dist: number } | null = null;
  for (const s of stations) {
    const d = haversineMeters(lat, lon, s.lat, s.lon);
    if (!best || d < best.dist) best = { station: s, dist: d };
  }
  return best;
}

// ─── Nearby permits (labor competition signal) ─────────────────────
// Socrata supports within_circle(location, lat, lon, radius_meters)
// for geo queries. 800m ≈ 0.5mi. Filter to permits issued within the
// last 90d so we capture *current* competition only.

/**
 * Count and total value of permits over $50k issued within 800 m in the last
 * 90 days. Returns null on any HTTP or network failure.
 */
async function fetchNearbyPermits(lat: number, lon: number) {
  const since = new Date(Date.now() - 90 * 86400000).toISOString().slice(0, 10);
  const where = encodeURIComponent(
    `within_circle(location, ${lat}, ${lon}, 800) AND issue_date>'${since}' AND reported_cost>50000`,
  );
  try {
    const res = await fetch(
      `https://data.cityofchicago.org/resource/ydr8-5enu.json?$select=count(*),sum(reported_cost)&$where=${where}`,
      { headers: { "User-Agent": UA }, signal: AbortSignal.timeout(10000) },
    );
    if (!res.ok) return null;
    const j = (await res.json()) as any[];
    const row = j?.[0] || {};
    return {
      count: parseInt(row.count || "0", 10),
      total_value: parseFloat(row.sum_reported_cost || "0"),
    };
  } catch {
    return null;
  }
}

// ─── Site Context (Chicago TIF + landmark + community area) ────────
// All Socrata. Lookup by address (street_number + street_name where
// available, lat/long otherwise). Bundle into a single brief so the
// UI gets one tile instead of 3.
const TIF_DATASET = "imgn-2suh"; // TIF District Programming 2022-2026
const LANDMARK_DATASET = "habu-n236"; // Individual Landmarks - Map

/**
 * Build the site-context brief for an address: landmark status plus, when
 * the assessor record supplies coordinates, TIF district, nearest CTA
 * station and nearby permit activity.
 */
export async function fetchSiteContext(address: string): Promise<SiteContextBrief> {
  const now = new Date().toISOString();
  const cleaned = address.trim().replace(/\s+/g, " ").toUpperCase();
  const cached = await cacheGet(cleaned, "site_context");
  if (cached) return cached;
  if (!cleaned) {
    return {
      source: "chicago_socrata",
      fetched_at: now,
      searched_address: address,
      status: "no_match",
    };
  }

  // Pull lat/long from the assessor record (we already query it).
  const owner = await fetchPropertyOwner(address);
  let lat: string | undefined;
  let lon: string | undefined;
  let ward: string | undefined;
  if (owner.status === "ok") {
    lat = owner.latitude;
    lon = owner.longitude;
    ward = owner.ward;
  }

  // Community area number → name table is public on Chicago.gov but
  // we'd need it static; ship without name for now (number suffices).
  const safe = cleaned.replace(/'/g, "''");
  const landmarkUrl = `https://data.cityofchicago.org/resource/${LANDMARK_DATASET}.json?$where=upper(address)%20like%20upper('%25${encodeURIComponent(safe)}%25')&$limit=1`;
  let isLandmark = false;
  let landmarkName: string | undefined;
  try {
    const r = await fetch(landmarkUrl, {
      headers: { "User-Agent": UA },
      signal: AbortSignal.timeout(8000),
    });
    if (r.ok) {
      const rows = (await r.json()) as any[];
      if (rows && rows.length) {
        isLandmark = true;
        landmarkName = rows[0].landmark_name || rows[0].name;
      }
    }
  } catch {
    /* non-fatal: landmark status simply stays false */
  }

  // TIF polygon containment + CTA distance + nearby permits — all
  // depend on having lat/long from the Cook County Assessor.
  let inTif = false;
  let tifName: string | undefined;
  let ctaName: string | undefined;
  let ctaLines: string | undefined;
  let ctaDist: number | undefined;
  let nearbyCount: number | undefined;
  let nearbyValue: number | undefined;

  if (lat && lon) {
    const latNum = parseFloat(lat);
    const lonNum = parseFloat(lon);
    if (!Number.isNaN(latNum) && !Number.isNaN(lonNum)) {
      // All three geo lookups in parallel. Note the argument order:
      // findTifDistrict is called with (lon, lat), the others with (lat, lon).
      const [tif, cta, nearby] = await Promise.all([
        findTifDistrict(lonNum, latNum),
        nearestCtaStation(latNum, lonNum),
        fetchNearbyPermits(latNum, lonNum),
      ]);
      if (tif) {
        inTif = true;
        tifName = tif.name;
      }
      if (cta) {
        ctaName = cta.station.station_name;
        ctaLines = cta.station.lines.join("/");
        ctaDist = Math.round(cta.dist);
      }
      if (nearby) {
        nearbyCount = nearby.count;
        nearbyValue = nearby.total_value;
      }
    }
  }

  const brief: SiteContextBrief = {
    source: "chicago_socrata",
    fetched_at: now,
    searched_address: address,
    status: "ok",
    in_tif_district: inTif,
    tif_district_name: tifName,
    is_landmark: isLandmark,
    landmark_name: landmarkName,
    community_area: undefined,
    ward,
    latitude: lat,
    longitude: lon,
    nearest_cta_station: ctaName,
    nearest_cta_lines: ctaLines,
    nearest_cta_distance_m: ctaDist,
    nearby_permits_90d: nearbyCount,
    nearby_permits_value_90d: nearbyValue,
  };
  // A transient assessor failure leaves every geo-derived field empty; don't
  // pin that degraded answer in the cache.
  if (owner.status !== "error") {
    await cacheSet(cleaned, "site_context", brief);
  }
  return brief;
}

// ─── MBE/WBE/DBE diversity certification ───────────────────────────
// Chicago Dept of Procurement Economic Inclusion Certified Vendors.
const DIVERSITY_DATASET = "2iq3-bugw";

// Looks up MBE/WBE/DBE certifications whose legal_name contains `name`.
// A 404 from the resource endpoint yields a "needs_setup" placeholder that
// is cached for 7 days; any other failure yields an uncached "needs_setup"
// brief carrying the error message.
export async function fetchDiversityCerts(name: string): Promise<DiversityCertBrief> {
  const now = new Date().toISOString();
  const cacheKey = normalizeEntityName(name);
  const cached = await cacheGet(cacheKey, "diversity");
  if (cached) return cached;
  const safe = name.replace(/'/g, "''"); // SoQL string-literal escaping
  const url = `https://data.cityofchicago.org/resource/${DIVERSITY_DATASET}.json?$where=upper(legal_name)%20like%20upper('%25${encodeURIComponent(safe)}%25')&$limit=10`;
  try {
    const r = await fetch(url, {
      headers: { "User-Agent": UA },
      signal: AbortSignal.timeout(8000),
    });
    if (r.status === 404) {
      // Dataset listed in catalog but resource endpoint 404s — known issue.
      const skeletal: DiversityCertBrief = {
        source: "chicago_dps_economic_inclusion",
        fetched_at: now,
        searched_name: name,
        status: "needs_setup",
        reason: "Chicago MBE/WBE/DBE datasets (2iq3-bugw, 69yt-tb5j, ci93-uc8s, 8dxf-6ahp) are catalog-listed but the resource endpoint returns 404. Likely archived or auth-gated. Path: contact Chicago DPS for alternate access OR use the procurement Intent-to-Award/Execute datasets which DO work.",
        certifications: [],
      };
      await cacheSet(cacheKey, "diversity", skeletal, 7 * 24 * 60 * 60 * 1000);
      return skeletal;
    }
    // Other non-OK statuses (429, 5xx, …) are not the known 404 issue:
    // report what actually happened and do not cache it for a week.
    if (!r.ok) throw new Error(`diversity HTTP ${r.status}`);

    const rows = (await r.json()) as any[];
    const certs = (rows || []).map((row) => ({
      category: row.certification || row.classification || "?",
      expiration: row.certification_expiration_date || row.expiration_date,
      type: row.business_type || row.type,
    }));
    const brief: DiversityCertBrief = {
      source: "chicago_dps_economic_inclusion",
      fetched_at: now,
      searched_name: name,
      status: certs.length > 0 ? "ok" : "no_match",
      certifications: certs,
      source_url: `https://data.cityofchicago.org/resource/${DIVERSITY_DATASET}.json`,
    };
    await cacheSet(cacheKey, "diversity", brief);
    return brief;
  } catch (e) {
    return {
      source: "chicago_dps_economic_inclusion",
      fetched_at: now,
      searched_name: name,
      status: "needs_setup",
      reason: `dataset access error: ${(e as Error).message}`,
      certifications: [],
      error: (e as Error).message,
    };
  }
}

// ─── News mentions via Google News RSS ─────────────────────────────
// news.google.com/rss/search?q={name} — free, no auth.
// Returns RSS XML; parse entries for title/link/pubDate.

const RSS_ITEM_RE = /<item>([\s\S]*?)<\/item>/g;
const RSS_SOURCE_RE = /<source[^>]*>([\s\S]*?)<\/source>/;
const RSS_CDATA_MARKERS_RE = /<!\[CDATA\[|\]\]>/g;

// Text content of the first <tag>…</tag> inside an RSS item body, with
// CDATA markers removed; "" when the tag is absent.
function rssTagText(itemXml: string, tag: string): string {
  const found = itemXml.match(new RegExp(`<${tag}[^>]*>([\\s\\S]*?)<\\/${tag}>`));
  if (!found) return "";
  return found[1].replace(RSS_CDATA_MARKERS_RE, "").trim();
}

// Searches Google News RSS for the quoted name and returns up to five
// headlines plus the total item count in the feed. Successful results are
// cached for 6 hours; short names (< 5 chars) are not searched at all.
export async function fetchNewsMentions(name: string): Promise<NewsBrief> {
  const now = new Date().toISOString();
  const cacheKey = normalizeEntityName(name);
  const cached = await cacheGet(cacheKey, "news");
  if (cached) return cached;
  if (!name || name.length < 5) {
    return {
      source: "google_news_rss",
      fetched_at: now,
      searched_name: name,
      status: "no_match",
      total_mentions: 0,
      recent_headlines: [],
    };
  }
  const q = encodeURIComponent(`"${name}"`);
  const url = `https://news.google.com/rss/search?q=${q}&hl=en-US&gl=US&ceid=US:en`;
  try {
    const r = await fetch(url, {
      headers: { "User-Agent": "Mozilla/5.0 lakehouse-copilot" },
      signal: AbortSignal.timeout(10000),
    });
    if (!r.ok) throw new Error(`news HTTP ${r.status}`);
    const xml = await r.text();
    // matchAll works on a clone of the regex, so sharing the /g constant is safe.
    const items = [...xml.matchAll(RSS_ITEM_RE)];
    const headlines = items
      .slice(0, 5)
      .map((m) => {
        const inner = m[1];
        const sourceMatch = inner.match(RSS_SOURCE_RE);
        const src = sourceMatch ? sourceMatch[1].replace(RSS_CDATA_MARKERS_RE, "").trim() : "";
        return {
          title: rssTagText(inner, "title"),
          source: src,
          date: rssTagText(inner, "pubDate"),
          url: rssTagText(inner, "link"),
        };
      })
      .filter((h) => h.title);
    const brief: NewsBrief = {
      source: "google_news_rss",
      fetched_at: now,
      searched_name: name,
      status: headlines.length > 0 ? "ok" : "no_match",
      total_mentions: items.length,
      recent_headlines: headlines,
    };
    // 6h TTL — news cycles fast
    await cacheSet(cacheKey, "news", brief, 6 * 60 * 60 * 1000);
    return brief;
  } catch (e) {
    return {
      source: "google_news_rss",
      fetched_at: now,
      searched_name: name,
      status: "error",
      total_mentions: 0,
      recent_headlines: [],
      error: (e as Error).message,
    };
  }
}

// ─── BLS construction employment (Chicago MSA) ─────────────────────
// Single shared cache keyed on the series ID since it's macro context.
const BLS_CHI_CONSTRUCTION_SERIES = "SMU17169802000000001";
let _blsCache: BlsTrendBrief | null = null;
let _blsCacheAt = 0;
const BLS_CACHE_TTL_MS = 6 * 60 * 60 * 1000;
// Monthly observations only; anything else (e.g. an annual-average row)
// would break month-over-month arithmetic.
const BLS_MONTHLY_PERIOD_RE = /^M(0[1-9]|1[0-2])$/;

// Fetches the last ~3 calendar years of the series and derives the latest
// value, month-over-month and year-over-year percent change, and a trend
// label (±1.5% YoY thresholds). Successful briefs are memoized in-process
// for 6 hours; failures return an uncached status:"error" brief.
export async function fetchBlsConstructionTrend(): Promise<BlsTrendBrief> {
  if (_blsCache && Date.now() - _blsCacheAt < BLS_CACHE_TTL_MS) return _blsCache;
  const now = new Date().toISOString();
  const thisYear = new Date().getFullYear();
  try {
    const res = await fetch("https://api.bls.gov/publicAPI/v1/timeseries/data/", {
      method: "POST",
      headers: { "Content-Type": "application/json", "User-Agent": UA },
      body: JSON.stringify({
        seriesid: [BLS_CHI_CONSTRUCTION_SERIES],
        startyear: String(thisYear - 2),
        endyear: String(thisYear),
      }),
      signal: AbortSignal.timeout(12000),
    });
    if (!res.ok) throw new Error(`bls HTTP ${res.status}`);
    const j = (await res.json()) as any;
    const series = j?.Results?.series?.[0];
    const data = (series?.data || []) as Array<{ year: string; period: string; value: string }>;

    // Keep well-formed monthly points and order them newest-first ourselves
    // rather than trusting response order.
    const monthly = data
      .filter((d) => BLS_MONTHLY_PERIOD_RE.test(d.period))
      .map((d) => ({
        year: Number(d.year),
        month: Number(d.period.slice(1)),
        value: parseFloat(d.value),
      }))
      .filter((d) => Number.isInteger(d.year) && Number.isFinite(d.value))
      .sort((a, b) => b.year - a.year || b.month - a.month);
    if (monthly.length === 0) throw new Error("no data");

    const periodKey = (year: number, month: number) =>
      `${year}-M${String(month).padStart(2, "0")}`;
    const obs = monthly.map((d) => ({ period: periodKey(d.year, d.month), value: d.value }));
    const valueByPeriod = new Map(obs.map((o) => [o.period, o.value] as const));

    const newest = monthly[0];
    const latest = obs[0];
    // Compare against the actual prior month / same month last year. Fixed
    // offsets (obs[1], obs[12]) silently pick the wrong point when the
    // series has a gap.
    const prevMonthKey =
      newest.month === 1
        ? periodKey(newest.year - 1, 12)
        : periodKey(newest.year, newest.month - 1);
    const prevYearKey = periodKey(newest.year - 1, newest.month);
    const pctChangeFrom = (base: number | undefined) =>
      base !== undefined && base !== 0 ? ((latest.value - base) / base) * 100 : null;
    const mom = pctChangeFrom(valueByPeriod.get(prevMonthKey));
    const yoy = pctChangeFrom(valueByPeriod.get(prevYearKey));

    const trend: BlsTrendBrief["trend"] =
      yoy === null ? "unknown"
      : yoy > 1.5 ? "growing"
      : yoy < -1.5 ? "declining"
      : "stable";
    const brief: BlsTrendBrief = {
      source: "bls.gov",
      fetched_at: now,
      series_id: BLS_CHI_CONSTRUCTION_SERIES,
      status: "ok",
      latest,
      yoy_change_pct: yoy,
      mom_change_pct: mom,
      recent: obs.slice(0, 6),
      trend,
    };
    _blsCache = brief;
    _blsCacheAt = Date.now();
    return brief;
  } catch (e) {
    return {
      source: "bls.gov",
      fetched_at: now,
      series_id: BLS_CHI_CONSTRUCTION_SERIES,
      status: "error",
      latest: null,
      yoy_change_pct: null,
      mom_change_pct: null,
      recent: [],
      trend: "unknown",
      error: (e as Error).message,
    };
  }
}

// ─── News sentiment scoring ────────────────────────────────────────
// Run this over the existing NewsBrief headlines. Cheap keyword-based
// classifier — not LLM-grade, but catches the obvious "lawsuit /
// fraud / OSHA fine" vs "awarded / expansion / wins" patterns.
// Keyword alternatives for the sentiment regexes. Each entry is a regex
// fragment; the whole list is wrapped in \b( … )\b below, so a bare stem
// would only match as a complete word. Stems meant to cover inflections
// ("embezzlement", "injured", …) therefore carry an explicit \w* suffix.
const SENTIMENT_NEG = [
  "lawsuit","sued","fraud","fines?","fined","penalty","penalties",
  "violations?","investigation","investigated","indicted","charged",
  "guilty","arrest","arrested","bankrupt(cy)?","insolvenc?y",
  "debt","default","fired","layoffs?","quit","resigned",
  "scandal","whistleblower","theft","embezzl\\w*","kickback","bribe",
  "complaint","grievanc\\w*","strike","walkout","picket","accident",
  "death","fatal","killed","injur\\w*","hospitaliz\\w*","amputation","fall",
  "collapse","collapsed","fire","explosion","shut.?down","stop.?work",
  "delay","delays","behind.?schedule","over.?budget","cancel(l?)ed",
  "subpoena","audit","misconduct","harass\\w*","discriminat\\w*","retaliat\\w*",
  "evict\\w*","foreclos\\w*","levied",
];
const SENTIMENT_POS = [
  "award(?:ed|s)","won","wins?","top","best","gain(?:ed|s)?",
  "expand(?:ed|s|ing)?","expansion","contract","selected","chosen",
  "partnership","invest(?:ed|s|ing|ment)?","launch","launched",
  "milestone","completed?","record","strong","grew","grow(?:th|ing)",
  "promoted","hired","appoint(?:ed|s)?","ribbon","groundbreaking",
  "innovation","new\\s+headquarters","achievement","recogni[zs]ed",
  "leader(?:ship)?","sustainab\\w*","green","clean.?energy",
];
const NEG_RE = new RegExp("\\b(" + SENTIMENT_NEG.join("|") + ")\\b", "i");
const POS_RE = new RegExp("\\b(" + SENTIMENT_POS.join("|") + ")\\b", "i");
// Global variants for counting every hit in a headline. Only used with
// String.prototype.matchAll, which works on a clone, so no lastIndex state
// leaks between calls.
const NEG_RE_ALL = new RegExp(NEG_RE.source, "gi");
const POS_RE_ALL = new RegExp(POS_RE.source, "gi");
// How many matched keywords to keep per flagged headline.
const SENTIMENT_MAX_REASONS = 3;

// Classifies each headline as negative, positive or neutral by comparing
// how many negative vs. positive keywords its title contains (ties are
// neutral). Returns the counts, a score in [-1, 1] equal to
// (positive − negative) / headline count, and up to five flagged
// headlines with the first few matched keywords as reasons.
export function scoreNewsSentiment(news: NewsBrief): NewsSentiment {
  const headlines = news.recent_headlines || [];
  let pos = 0;
  let neg = 0;
  const flagged: NewsSentiment["flagged_headlines"] = [];
  for (const h of headlines) {
    const t = h.title || "";
    // Polarity uses the full hit counts; only the stored reasons are capped.
    const negHits = [...t.matchAll(NEG_RE_ALL)].map((m) => m[1]);
    const posHits = [...t.matchAll(POS_RE_ALL)].map((m) => m[1]);
    if (negHits.length > posHits.length) {
      neg++;
      flagged.push({ title: t, polarity: "neg", reasons: negHits.slice(0, SENTIMENT_MAX_REASONS) });
    } else if (posHits.length > negHits.length) {
      pos++;
      flagged.push({ title: t, polarity: "pos", reasons: posHits.slice(0, SENTIMENT_MAX_REASONS) });
    }
  }
  const total = headlines.length;
  // No headlines → neutral score, and nothing counted as neutral either.
  const score = total === 0 ? 0 : (pos - neg) / total;
  return {
    score: Math.max(-1, Math.min(1, score)),
    positive: pos,
    negative: neg,
    neutral: total - pos - neg,
    flagged_headlines: flagged.slice(0, 5),
  };
}

// ─── Placeholder fetchers for sources that need scraping work ──────

// Placeholder: always returns a "needs_setup" brief explaining why the
// OSHA Severe Injury Reports source is not wired up yet.
// NOTE(review): the brief's named type is declared outside this section;
// the return type is spelled via EntityBrief["osha_sir"] — confirm and
// swap in the named type.
export async function fetchOshaSirBrief(
  name: string,
): Promise<NonNullable<EntityBrief["osha_sir"]>> {
  return {
    source: "osha.gov/severeinjury",
    fetched_at: new Date().toISOString(),
    searched_name: name,
    status: "needs_setup",
    reason: "OSHA Severe Injury Reports CSV is bot-protected at dol.gov (Imperva challenge). Path forward: (a) human-mediated periodic CSV download into /data/_entity_cache/sir.csv, then local lookup, or (b) browser-automation scraper with CAPTCHA handling.",
  };
}

// Placeholder: always returns a "needs_setup" brief for property liens.
// NOTE(review): return type spelled via PermitEntityBrief["property"]["liens"]
// because the named brief type is declared outside this section — confirm.
export async function fetchLiensBrief(
  address: string,
): Promise<NonNullable<PermitEntityBrief["property"]["liens"]>> {
  // Cook County Recorder of Deeds merged into Cook County Clerk in 2020.
  // Public search portal: crs.cookcountyclerkil.gov (HTML form, JSESSIONID-based).
  // Resource catalog at datacatalog.cookcountyil.gov returns 404 on most lien
  // dataset IDs (xfev-8smz UCC liens specifically, also xrzj-c8ez code violations).
  // Path: HTML scrape of crs.cookcountyclerkil.gov with PIN→search→paginate.
  // Queued — needs ~150 LOC of form-state parsing.
  return {
    source: "cook_county_recorder",
    fetched_at: new Date().toISOString(),
    searched_address: address,
    status: "needs_setup",
    reason: "Cook County Clerk recordings portal (crs.cookcountyclerkil.gov) needs JSESSIONID + form POST scrape. Socrata catalog UCC dataset (xfev-8smz) returns 404 on resource endpoint. Queued for next batch.",
  };
}

// ─── NLRB cases (real scraper) ─────────────────────────────────────
// Public search at apps.nlrb.gov/eservice/dailyclosed.aspx and
// nlrb.gov/search/case. The /search/case page returns plain HTML.

// Anchor tags linking to /case/<case-number> in the search-result HTML.
const NLRB_CASE_LINK_RE = /<a[^>]+href="\/case\/([0-9A-Z\-]+)"/g;

// Scrapes the NLRB case search for `name`. Returns up to five case numbers
// (type/date/status are not parsed) and the total from the page's
// "N results" text, falling back to the number of case links found.
// Results are cached for 24 hours; fetch failures return an uncached
// "needs_setup" brief with the error in `reason`.
export async function fetchNlrbBriefReal(name: string): Promise<NlrbBrief> {
  const now = new Date().toISOString();
  const cacheKey = normalizeEntityName(name);
  const cached = await cacheGet(cacheKey, "nlrb");
  if (cached) return cached;
  const q = encodeURIComponent(name);
  const url = `https://www.nlrb.gov/search/case?search_term=${q}`;
  try {
    const r = await fetch(url, {
      headers: { "User-Agent": UA },
      signal: AbortSignal.timeout(12000),
    });
    if (!r.ok) throw new Error(`nlrb HTTP ${r.status}`);
    const html = await r.text();
    // Cases appear in result blocks — count occurrences of case-detail links
    const caseLinks = [...html.matchAll(NLRB_CASE_LINK_RE)];
    const cases = caseLinks.slice(0, 5).map((m) => ({
      case_number: m[1],
      case_type: "?",
      date: "",
      status: "",
    }));
    // Total count appears near top — pattern "X results"
    const totalMatch = html.match(/(\d[\d,]*)\s+result/i);
    const total = totalMatch
      ? parseInt(totalMatch[1].replace(/,/g, ""), 10)
      : caseLinks.length;
    const brief: NlrbBrief = {
      source: "nlrb.gov",
      fetched_at: now,
      searched_name: name,
      status: cases.length > 0 ? "ok" : "no_match",
      total_cases: total,
      recent_cases: cases,
    };
    await cacheSet(cacheKey, "nlrb", brief, 24 * 60 * 60 * 1000);
    return brief;
  } catch (e) {
    return {
      source: "nlrb.gov",
      fetched_at: now,
      searched_name: name,
      status: "needs_setup",
      reason: `nlrb fetch error: ${(e as Error).message}`,
    };
  }
}

// ─── Project Index Score (matrix aggregation) ──────────────────────
// Walks every signal across owner/contractors and produces a single
// 0-100 index anchored at 50 neutral. Each contribution is auditable.
// Weights are documentation-grade — adjust over time as we learn which
// signals actually correlate with project success/delay.
// Aggregates property-level, per-entity and macro signals into one index.
//
// property — owner / violations / site_context briefs for the permit site
// entities — one brief bundle per deduplicated permit contact
// macro    — optional BLS employment trend brief
//
// Returns { score, band, contributions, partial }: `score` is 50 plus the
// sum of contributions, rounded and clamped to 0..100; `contributions` is
// sorted by absolute contribution, largest first; `partial` is true when a
// source that feeds the index was unavailable.
export function scoreProject(
  property: PermitEntityBrief["property"],
  entities: EntityBrief[],
  macro: BlsTrendBrief | null = null,
): ProjectIndexScore {
  const cs: SignalContribution[] = [];
  let partial = false;

  // Macro: Chicago construction employment YoY. Affects every permit
  // proportionally — labor-shrinking market raises filling difficulty,
  // labor-growing market signals confidence.
  if (macro?.status === "ok" && macro.yoy_change_pct !== null) {
    const w = 3;
    const yoy = macro.yoy_change_pct;
    const dir = yoy > 1.5 ? 1 : yoy < -1.5 ? -1 : 0;
    if (dir !== 0) {
      cs.push({
        signal: "macro_chi_construction_employment",
        weight: w,
        direction: dir,
        raw: yoy,
        contribution: dir * w,
        note: `Chicago MSA construction employment ${yoy >= 0 ? "+" : ""}${yoy.toFixed(1)}% YoY (BLS) — labor market ${macro.trend}`,
      });
    }
  }

  // ─── Property-level signals ───
  if (property.violations?.status === "ok") {
    const v = property.violations;
    if (v.stop_work_orders > 0) {
      const w = 10;
      cs.push({
        signal: "stop_work_orders",
        weight: w,
        direction: -1,
        raw: v.stop_work_orders,
        // Scales with the count, capped at 3 orders.
        contribution: -w * Math.min(v.stop_work_orders, 3),
        note: `${v.stop_work_orders} stop-work order${v.stop_work_orders === 1 ? "" : "s"} on this property`,
      });
    }
    if (v.open_violations > 0) {
      const w = 6;
      cs.push({
        signal: "open_property_violations",
        weight: w,
        direction: -1,
        raw: v.open_violations,
        // Half a weight per violation, capped at 4 violations.
        contribution: -w * Math.min(v.open_violations, 4) / 2,
        note: `${v.open_violations} open building violation${v.open_violations === 1 ? "" : "s"} at the address`,
      });
    } else if (v.total_violations === 0) {
      const w = 3;
      cs.push({
        signal: "property_clean",
        weight: w,
        direction: 1,
        raw: 0,
        contribution: w,
        note: "No building violations on record",
      });
    }
  }

  // TIF district = public-subsidy zone. Project has financial backing
  // from the city's tax-increment financing pot. Mild positive signal
  // because public oversight + structured funding usually means the
  // project survives funding gaps. Not a blanket good — TIFs do fail —
  // but worth noting.
  if (property.site_context?.in_tif_district) {
    const w = 4;
    cs.push({
      signal: "in_tif_district",
      weight: w,
      direction: 1,
      raw: property.site_context.tif_district_name,
      contribution: w,
      note: `Site is inside TIF "${property.site_context.tif_district_name}" — public tax-increment financing backing`,
    });
  }

  // Transit access — workers can commute via L without parking.
  // ≤800m is "right at a station," ≤1500m is "walkable," beyond that
  // crews are car/shuttle dependent.
  const ctaDistance = property.site_context?.nearest_cta_distance_m;
  // typeof guard: a null distance must not be read as 0m.
  if (property.site_context && typeof ctaDistance === "number") {
    const m = ctaDistance;
    if (m <= 800) {
      const w = 2;
      cs.push({
        signal: "transit_access_strong",
        weight: w,
        direction: 1,
        raw: m,
        contribution: w,
        note: `Site is ${m}m from CTA ${property.site_context.nearest_cta_station} (${property.site_context.nearest_cta_lines}) — workers can commute by L`,
      });
    } else if (m > 1500) {
      const w = 1;
      cs.push({
        signal: "transit_access_weak",
        weight: w,
        direction: -1,
        raw: m,
        contribution: -w,
        note: `Site is ${(m / 1000).toFixed(1)}km from nearest CTA stop — crew transport burden`,
      });
    }
    // 800m < m ≤ 1500m: neutral — walkable but not adjacent.
  }

  // Labor competition — many active permits within 0.5mi means crews
  // are already booked elsewhere. >5 permits in 90d is "saturated."
  const nearbyPermits = property.site_context?.nearby_permits_90d;
  if (typeof nearbyPermits === "number" && nearbyPermits > 5) {
    const n = nearbyPermits;
    const w = 3;
    cs.push({
      signal: "labor_competition_high",
      weight: w,
      direction: -1,
      raw: n,
      contribution: -Math.min(w, n / 5),
      note: `${n} other permits active within 0.5mi (last 90d) — crew competition`,
    });
  }

  if (property.site_context?.is_landmark) {
    const w = 3;
    cs.push({
      signal: "landmark_status",
      weight: w,
      direction: -1,
      raw: property.site_context.landmark_name,
      contribution: -w,
      note: `Landmark designation${property.site_context.landmark_name ? ` (${property.site_context.landmark_name})` : ""} — preservation review extends timeline`,
    });
  }

  if (property.owner?.status === "ok") {
    // Normalize so " il" and "IL" agree. An unknown state yields no signal:
    // absence of a mailing state is not evidence the owner is local.
    const ownerState = String(property.owner.mailing_state ?? "").trim().toUpperCase();
    if (ownerState && ownerState !== "IL") {
      const w = 2;
      cs.push({
        signal: "owner_out_of_state",
        weight: w,
        direction: -1,
        raw: ownerState,
        contribution: -w,
        note: `Owner mails out of state (${ownerState}) — slower local accountability`,
      });
    } else if (ownerState === "IL") {
      const w = 2;
      cs.push({
        signal: "owner_local",
        weight: w,
        direction: 1,
        raw: "IL",
        contribution: w,
        note: "Owner mails locally — faster accountability",
      });
    }
  }

  // ─── Per-entity signals ───
  for (const e of entities) {
    const tag = `[${e.display_name}]`;

    if (e.osha?.status === "ok") {
      const o = e.osha;
      if (o.inspection_count > 10) {
        const w = 6;
        cs.push({
          signal: "osha_high_inspection_count",
          weight: w,
          direction: -1,
          raw: o.inspection_count,
          contribution: -w,
          note: `${tag} ${o.inspection_count} OSHA inspections nationally`,
        });
      } else if (o.inspection_count === 0) {
        // NOTE(review): this only fires for status "ok" with zero
        // inspections; if the OSHA fetcher reports an empty result as
        // "no_match" instead, this branch is unreachable — confirm.
        const w = 3;
        cs.push({
          signal: "osha_clean",
          weight: w,
          direction: 1,
          raw: 0,
          contribution: w,
          note: `${tag} no OSHA inspections on record`,
        });
      }
      if (o.most_recent_date) {
        // An unparseable date gives NaN, which fails the < 90 test below.
        const ageDays = (Date.now() - Date.parse(o.most_recent_date)) / 86400000;
        if (ageDays < 90) {
          const w = 7;
          cs.push({
            signal: "osha_recent_action",
            weight: w,
            direction: -1,
            raw: o.most_recent_date,
            contribution: -w,
            note: `${tag} OSHA inspection ${Math.round(ageDays)}d ago`,
          });
        }
      }
    } else if (e.osha?.status === "error") {
      partial = true;
    }

    if (e.history?.status === "ok") {
      if (e.history.trend === "new") {
        const w = 8;
        cs.push({
          signal: "new_entity",
          weight: w,
          direction: -1,
          raw: e.history.permits_historical_total,
          contribution: -w,
          note: `${tag} only ${e.history.permits_historical_total} permits ever — possible LLC-shuffle signature`,
        });
      } else if (e.history.trend === "growing") {
        const w = 3;
        cs.push({
          signal: "activity_growing",
          weight: w,
          direction: 1,
          raw: e.history.permits_last_180d,
          contribution: w,
          note: `${tag} growing Chicago footprint (${e.history.permits_last_180d} in 180d)`,
        });
      } else if (e.history.trend === "declining") {
        const w = 4;
        cs.push({
          signal: "activity_declining",
          weight: w,
          direction: -1,
          raw: e.history.permits_last_180d,
          contribution: -w,
          note: `${tag} declining Chicago footprint`,
        });
      }
    }

    if (e.federal?.status === "ok" && e.federal.total_awards_value > 0) {
      const fedM = e.federal.total_awards_value / 1e6;
      // Two decimals below $1M so small totals don't render as "$0M".
      const fedLabel = fedM >= 1 ? fedM.toFixed(0) : fedM.toFixed(2);
      const w = 5;
      cs.push({
        signal: "federal_contract_history",
        weight: w,
        direction: 1,
        raw: e.federal.total_awards_value,
        // Log-scaled so very large totals saturate at the weight.
        contribution: Math.min(w, Math.log10(fedM + 1) * 2),
        note: `${tag} $${fedLabel}M federal contracts — vetted compliance`,
      });
    }

    // SVEP — the heaviest-weight negative signal we have. OSHA placed
    // this contractor in the Severe Violator Enforcement Program after
    // willful or repeat violations. Real money + worker safety risk.
    if (e.svep?.flagged) {
      const w = 10;
      cs.push({
        signal: "osha_svep_listed",
        weight: w,
        direction: -1,
        raw: e.svep.matched_entries.map((m) => m.name).join(" | "),
        contribution: -w * 2,
        note: `${tag} OSHA SVEP-listed (Severe Violator Enforcement Program) — formally flagged worst-actor`,
      });
    }

    if (e.parent_link?.status === "ok" && e.parent_link.parent_ticker) {
      const w = 3;
      cs.push({
        signal: "public_parent_chain",
        weight: w,
        direction: 1,
        raw: e.parent_link.parent_ticker,
        contribution: w,
        note: `${tag} traceable to public parent ${e.parent_link.parent_ticker}`,
      });
    }

    // News sentiment over the recent headlines. Negative bias means
    // the press cycle around this firm is dominated by lawsuits /
    // accidents / fines.
    if (e.news_sentiment) {
      const ns = e.news_sentiment;
      if (ns.negative > ns.positive && ns.negative > 0) {
        const w = 5;
        cs.push({
          signal: "news_sentiment_negative",
          weight: w,
          direction: -1,
          raw: ns.score,
          contribution: -Math.min(w, ns.negative * 1.5),
          note: `${tag} negative news cycle (${ns.negative} flagged: ${ns.flagged_headlines.filter((h) => h.polarity === "neg").slice(0, 2).map((h) => h.reasons.join("/")).join(", ")})`,
        });
      } else if (ns.positive > ns.negative && ns.positive >= 2) {
        const w = 2;
        cs.push({
          signal: "news_sentiment_positive",
          weight: w,
          direction: 1,
          raw: ns.score,
          contribution: w,
          note: `${tag} positive news cycle (${ns.positive} flagged on awards/expansion)`,
        });
      }
    }

    if (e.debarment?.status === "needs_setup") partial = true;
    if (e.nlrb?.status === "needs_setup") partial = true;
    if (e.ilsos?.status === "source_unreachable") partial = true;
  }

  // Net score from 50 baseline. Clamp 0..100.
  const net = cs.reduce((a, c) => a + c.contribution, 0);
  const score = Math.max(0, Math.min(100, Math.round(50 + net)));
  const band: ProjectIndexScore["band"] =
    score < 30 ? "red"
    : score < 45 ? "amber"
    : score <= 55 ? "neutral"
    : score <= 75 ? "green"
    : "strong";

  // Sort by absolute contribution, biggest movers first
  cs.sort((a, b) => Math.abs(b.contribution) - Math.abs(a.contribution));

  return { score, band, contributions: cs, partial };
}

// ─── Public: build a brief for a single permit ─────────────────────
type PermitInput = {
  permit_id?: string;
  address?: string;
  work_type?: string;
  contact_1_name?: string;
  contact_1_type?: string;
  contact_2_name?: string;
  contact_2_type?: string;
};

// Builds the full brief for one permit: per-contact entity signals,
// property-side signals, the flattened ticker list, macro context, and the
// project index score.
//
// opts.fetchOsha / fetchIlsos / fetchTicker / fetchHistory each default to
// enabled; pass `false` to skip that source (its slot is then null).
export async function buildPermitBrief(
  permit: PermitInput,
  opts: {
    fetchIlsos?: boolean;
    fetchOsha?: boolean;
    fetchTicker?: boolean;
    fetchHistory?: boolean;
  } = {},
): Promise<PermitEntityBrief> {
  const wantOsha = opts.fetchOsha !== false;
  const wantIlsos = opts.fetchIlsos !== false;
  const wantTicker = opts.fetchTicker !== false;
  const wantHistory = opts.fetchHistory !== false;

  // Macro context (BLS Chicago construction employment) — single fetch
  // per process-cache window. Started now so it overlaps the per-entity
  // calls; awaited just before scoring. fetchBlsConstructionTrend handles
  // its own errors and returns a status:"error" brief rather than
  // rejecting, so leaving the promise pending is safe.
  const macroPromise = fetchBlsConstructionTrend();

  const rawContacts: { name: string; role: string }[] = [];
  if (permit.contact_1_name?.trim()) {
    rawContacts.push({
      name: permit.contact_1_name.trim(),
      role: permit.contact_1_type || "CONTACT_1",
    });
  }
  if (permit.contact_2_name?.trim()) {
    rawContacts.push({
      name: permit.contact_2_name.trim(),
      role: permit.contact_2_type || "CONTACT_2",
    });
  }
  // Dedupe by normalized name — same firm often appears as both contacts.
  const seen = new Set<string>();
  const contacts: { name: string; role: string }[] = [];
  for (const c of rawContacts) {
    const key = normalizeEntityName(c.name);
    if (!key || seen.has(key)) continue;
    seen.add(key);
    contacts.push(c);
  }

  // Entities are processed one at a time on purpose: oshaGate() reads
  // lastOshaAt before sleeping and only writes it afterwards, so
  // overlapping OSHA calls would all clear the gate together and break
  // the 1 req/sec politeness limit. Within an entity, all sources fan out
  // in parallel.
  const entities: EntityBrief[] = [];
  for (const c of contacts) {
    // 12 parallel calls per entity. OSHA throttles internally, the rest
    // fan straight out. ~3-5s end-to-end per entity on cold cache.
    const [
      osha,
      ilsos,
      stock,
      history,
      debarment,
      parent_link,
      federal,
      nlrb,
      diversity,
      news,
      osha_sir,
      svep,
    ] = await Promise.all([
      wantOsha ? fetchOshaBrief(c.name) : Promise.resolve(null),
      wantIlsos ? fetchIlsosBrief(c.name) : Promise.resolve(null),
      wantTicker ? fetchTickerBrief(c.name) : Promise.resolve(null),
      wantHistory ? fetchContractorHistory(c.name) : Promise.resolve(null),
      fetchDebarmentBrief(c.name),
      fetchParentLink(c.name),
      fetchFederalContracts(c.name),
      fetchNlrbBriefReal(c.name),
      fetchDiversityCerts(c.name),
      fetchNewsMentions(c.name),
      fetchOshaSirBrief(c.name),
      fetchSvepBrief(c.name),
    ]);
    const news_sentiment = news ? scoreNewsSentiment(news) : null;
    entities.push({
      key: normalizeEntityName(c.name),
      display_name: c.name,
      role: c.role,
      ticker: entityTicker(c.name),
      osha,
      ilsos,
      stock,
      history,
      debarment,
      parent_link,
      federal,
      nlrb,
      diversity,
      news,
      news_sentiment,
      osha_sir,
      svep,
      risk: scoreEntity(osha, ilsos, history),
    });
  }

  // Flatten the ticker portfolio across entities, dedupe by ticker
  // symbol, sort by cap_proxy descending (most-profitable related
  // company first — Project Index "most-profitable beneficiary first").
  const tickersByKey = new Map();
  for (const e of entities) {
    if (e.stock?.status === "ok" && e.stock.ticker) {
      tickersByKey.set(e.stock.ticker, e.stock);
    }
  }
  const tickers = [...tickersByKey.values()].sort(
    (a, b) => (b.cap_proxy ?? 0) - (a.cap_proxy ?? 0),
  );

  // Property-side signals: owner, violations, site context, liens placeholder.
  const [owner, violations, site_context, liens] = await Promise.all([
    permit.address ? fetchPropertyOwner(permit.address) : Promise.resolve(null),
    permit.address ? fetchPropertyViolations(permit.address) : Promise.resolve(null),
    permit.address ? fetchSiteContext(permit.address) : Promise.resolve(null),
    permit.address ? fetchLiensBrief(permit.address) : Promise.resolve(null),
  ]);
  const union = unionsForWorkType(permit.work_type);

  const property = {
    address: permit.address || "",
    ticker: addressTicker(permit.address || ""),
    owner,
    violations,
    union,
    site_context,
    liens,
  };

  const macro = await macroPromise;
  const index_score = scoreProject(property, entities, macro);

  return {
    permit_id: permit.permit_id || "",
    property,
    entities,
    tickers,
    macro,
    index_score,
    // Roadmap: now-shorter list as more sources go live. Items pending
    // for batch 4-5: SEC Exhibit 21, debarment lists, USASpending,
    // mechanics liens, NLRB+OSHA-Severe-Injury, contractor drill-down.
    roadmap: [
      "SEC Exhibit 21 full index — walk every 10-K Exhibit 21 to build a complete subsidiary→parent tree (curated map covers top 12 GCs today)",
      "SAM.gov v3 API — register at sam.gov for free key; OR human-mediated CSV download into local cache",
      "IDOL prevailing-wage debarment list — HTML/PDF scrape from labor.illinois.gov",
      "OSHA Severe Injury Reports — bot-protected at dol.gov; needs human-mediated CSV refresh OR browser-automation",
      "Cook County Recorder mechanics liens — HTML scrape with PIN-based pagination",
      "TIF polygon containment check — needs lat/long-in-polygon (turf.js) to confirm in_tif_district",
      "Community area name table — static lookup from Chicago number → name",
      "BLS QCEW + FRED commodity prices — macroeconomic context",
      "PACER bankruptcy + civil litigation — federal court filings (free for opinion search)",
      "Cook County Treasurer — property tax delinquency",
      "NYC DOB + LA LADBS — cross-city contractor footprint",
      "LLC-shuffle detector — principal+agent fingerprint matching (blocked on ILSOS access)",
    ],
    generated_at: new Date().toISOString(),
  };
}