Robust SQL extraction: handles explanations, markdown, prefixes

clean_sql now uses 3 strategies in priority order:
1. Extract from ```sql...``` markdown blocks
2. Find first SELECT/WITH/INSERT/UPDATE/DELETE statement in text
3. Strip leading "sql" keyword fallback

Tested against 5 real model output patterns:
- Clean SQL ✓
- "sql" prefixed ✓
- Markdown fenced ✓
- Explanation before ```sql block ✓
- Explanation with SELECT buried in text ✓

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
root 2026-03-27 20:42:11 -05:00
parent 34c03894ae
commit 0bd753294b
5 changed files with 195 additions and 13 deletions

View File

@ -1025,23 +1025,47 @@ fn ResultsTable(response: QueryResponse) -> Element {
}
}
/// Clean AI-generated SQL: extract only the SQL query, strip everything else.
///
/// Strategies are tried in priority order:
/// 1. The contents of a Markdown code fence (a fence tagged `sql` wins over an
///    untagged one).
/// 2. The first `SELECT`/`WITH`/`INSERT`/`UPDATE`/`DELETE` statement found in
///    the text, cut after its first semicolon (or at end of input).
/// 3. The trimmed input with a leading line consisting only of `sql` removed.
///
/// Returns the extracted SQL as an owned, trimmed `String`. The result may be
/// empty (for example, for an empty fenced block).
fn clean_sql(raw: &str) -> String {
    let s = raw.trim();

    // Strategy 1: fenced code block.
    if let Some(block) = sql_fenced_block(s) {
        return block.to_string();
    }

    // Strategy 2: first recognisable statement in free text.
    if let Some(statement) = sql_first_statement(s) {
        return statement.to_string();
    }

    // Strategy 3: drop a leading "sql" line, keep the rest untouched.
    let mut lines = s.lines();
    match lines.next() {
        Some(first) if first.trim().eq_ignore_ascii_case("sql") => {
            lines.collect::<Vec<_>>().join("\n").trim().to_string()
        }
        _ => s.to_string(),
    }
}

/// Returns the trimmed body of the first complete code fence in `s`, preferring
/// a fence whose language tag is `sql` (matched case-insensitively).
fn sql_fenced_block(s: &str) -> Option<&str> {
    const FENCE: &str = "```";
    const SQL_FENCE: &str = "```sql";

    /// Body between byte offset `open_end` and the next closing fence, if any.
    fn body_after(s: &str, open_end: usize) -> Option<&str> {
        let after = &s[open_end..];
        after.find(FENCE).map(|close| after[..close].trim())
    }

    // ASCII-only case folding never changes byte lengths, so offsets found in
    // `lower` are valid offsets (and char boundaries) in `s`.
    let lower = s.to_ascii_lowercase();
    lower
        .find(SQL_FENCE)
        .and_then(|open| body_after(s, open + SQL_FENCE.len()))
        .or_else(|| {
            s.find(FENCE)
                .and_then(|open| body_after(s, open + FENCE.len()))
                .map(strip_sql_tag)
        })
}

/// Removes a leading `sql` language tag (any ASCII case) from a fence body.
fn strip_sql_tag(block: &str) -> &str {
    // `get` returns `None` for short input or a non-char-boundary, so this
    // cannot panic on multi-byte text.
    match block.get(..3) {
        Some(tag) if tag.eq_ignore_ascii_case("sql") => block[3..].trim_start(),
        _ => block,
    }
}

/// Locates the first SQL statement in free text and returns it, cut after the
/// first semicolon that follows its keyword (or at end of input).
///
/// A keyword that begins a line is preferred, so a CTE such as
/// `WITH t AS (SELECT ...) SELECT ...` is kept whole. Otherwise the keywords are
/// searched in priority order (`SELECT` first), which keeps ordinary English
/// words like "with" in a leading explanation from being taken as the start
/// of the statement when a `SELECT` is present.
fn sql_first_statement(s: &str) -> Option<&str> {
    const KEYWORDS: [&str; 5] = ["SELECT", "WITH", "INSERT", "UPDATE", "DELETE"];

    // ASCII-only upper-casing keeps byte offsets identical to `s`; the full
    // Unicode `to_uppercase` can change lengths and yield offsets that slice
    // `s` at the wrong place or panic on a non-char-boundary.
    let upper = s.to_ascii_uppercase();

    // Byte offset of the first non-whitespace character of every line.
    let line_start = upper
        .split_inclusive('\n')
        .scan(0usize, |offset, line| {
            let at = *offset + line.len() - line.trim_start().len();
            *offset += line.len();
            Some(at)
        })
        .find(|&at| KEYWORDS.iter().any(|kw| sql_keyword_at(&upper, at, kw)));

    let start = line_start.or_else(|| {
        KEYWORDS.iter().find_map(|kw| {
            upper
                .match_indices(*kw)
                .map(|(at, _)| at)
                .find(|&at| sql_keyword_at(&upper, at, kw))
        })
    })?;

    let statement = &s[start..];
    // NOTE: a semicolon inside a string literal also ends the statement here;
    // telling the two apart would need a real SQL tokenizer.
    let end = statement.find(';').map_or(statement.len(), |p| p + 1);
    Some(statement[..end].trim())
}

/// True when `word` occurs in `text` at byte offset `at` as a whole word, i.e.
/// it is not embedded in a longer identifier such as `SELECTED` or `UPDATED_AT`.
/// `at` must lie on a char boundary of `text`.
fn sql_keyword_at(text: &str, at: usize, word: &str) -> bool {
    fn is_ident(c: char) -> bool {
        c.is_alphanumeric() || c == '_'
    }

    text[at..].starts_with(word)
        && !text[..at].chars().next_back().map_or(false, is_ident)
        && !text[at + word.len()..].chars().next().map_or(false, is_ident)
}
fn format_cell(val: Option<&serde_json::Value>) -> String {

View File

@ -0,0 +1,65 @@
{
"id": "3b12ae24-17d4-4325-92bf-f3155982f3bf",
"name": "users",
"schema_fingerprint": "30c0e31f0963e6f4af02131bbb9ea246fbbd068b849b833565a4b28211fbc90b",
"objects": [
{
"bucket": "data",
"key": "datasets/users.parquet",
"size_bytes": 2012,
"created_at": "2026-03-28T01:38:59.904968123Z"
}
],
"created_at": "2026-03-28T01:38:59.904968680Z",
"updated_at": "2026-03-28T01:38:59.905221340Z",
"description": "",
"owner": "",
"sensitivity": null,
"columns": [
{
"name": "id",
"data_type": "Int32",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "username",
"data_type": "Utf8",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "password_hash",
"data_type": "Utf8",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "role",
"data_type": "Utf8",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "created_at",
"data_type": "Utf8",
"sensitivity": null,
"description": "",
"is_pii": false
}
],
"lineage": {
"source_system": "postgresql",
"source_file": "127.0.0.1:5432/knowledge_base.users",
"ingest_job": "pg-import-1774661939904",
"ingest_timestamp": "2026-03-28T01:38:59.904968123Z",
"parent_datasets": []
},
"freshness": null,
"tags": [],
"row_count": 1
}

View File

@ -0,0 +1,93 @@
{
"id": "3c2579b4-f3f3-4875-95fa-58d8b49ad94c",
"name": "meta_runs",
"schema_fingerprint": "68f2c0d7a3ceb0aaa3c17c64900704519c72d213161bc9e5179c42ee53f6d0df",
"objects": [
{
"bucket": "data",
"key": "datasets/meta_runs.parquet",
"size_bytes": 729773,
"created_at": "2026-03-28T01:38:57.380576453Z"
}
],
"created_at": "2026-03-28T01:38:57.380577270Z",
"updated_at": "2026-03-28T01:38:57.380846224Z",
"description": "",
"owner": "",
"sensitivity": null,
"columns": [
{
"name": "id",
"data_type": "Int32",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "pipeline_id",
"data_type": "Int32",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "iteration",
"data_type": "Int32",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "stage_results",
"data_type": "Utf8",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "final_output",
"data_type": "Utf8",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "score",
"data_type": "Float64",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "model_config",
"data_type": "Utf8",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "duration_ms",
"data_type": "Int32",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "created_at",
"data_type": "Utf8",
"sensitivity": null,
"description": "",
"is_pii": false
}
],
"lineage": {
"source_system": "postgresql",
"source_file": "127.0.0.1:5432/knowledge_base.meta_runs",
"ingest_job": "pg-import-1774661937380",
"ingest_timestamp": "2026-03-28T01:38:57.380576453Z",
"parent_datasets": []
},
"freshness": null,
"tags": [],
"row_count": 38
}

Binary file not shown.

BIN
data/datasets/users.parquet Normal file

Binary file not shown.