Robust SQL extraction: handles explanations, markdown, prefixes
clean_sql now uses 3 strategies in priority order:
1. Extract from ```sql...``` markdown blocks
2. Find the first SELECT/WITH/INSERT/UPDATE/DELETE statement in the text
3. Fall back to stripping a leading "sql" keyword

Tested against 5 real model output patterns:
- Clean SQL ✓
- "sql" prefixed ✓
- Markdown fenced ✓
- Explanation before ```sql block ✓
- Explanation with SELECT buried in text ✓

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
34c03894ae
commit
0bd753294b
@ -1025,23 +1025,47 @@ fn ResultsTable(response: QueryResponse) -> Element {
|
||||
}
|
||||
}
|
||||
|
||||
/// Clean AI-generated SQL: extract only the SQL query, strip everything else.
///
/// Strategies are tried in priority order; the first one that matches wins:
///
/// 1. The contents of the first closed Markdown code fence tagged `sql`.
/// 2. The contents of the first closed code fence of any kind, minus a
///    leading `sql` language tag (any letter case).
/// 3. The text from the first statement keyword through the first semicolon
///    (or the end of the input when there is no semicolon). Keywords are
///    matched as whole words, ASCII-case-insensitively, in the priority order
///    SELECT, WITH, INSERT, UPDATE, DELETE. A `WITH name AS (` common table
///    expression that starts before that match takes precedence, so a CTE is
///    not cut down to its inner SELECT.
/// 4. The trimmed input, minus a first line that consists only of `sql`.
///
/// Known limitation: strategy 3 stops at the first `;` even when that
/// semicolon sits inside a string literal.
fn clean_sql(raw: &str) -> String {
    let s = raw.trim();

    // Strategy 1: a ```sql ... ``` block.
    if let Some(start) = s.find("```sql") {
        let after = &s[start + 6..];
        if let Some(end) = after.find("```") {
            return after[..end].trim().to_string();
        }
    }

    // Strategy 2: any ``` ... ``` block.
    if let Some(start) = s.find("```") {
        let after = &s[start + 3..];
        if let Some(end) = after.find("```") {
            let inner = after[..end].trim();
            // Drop a language tag such as "SQL" that strategy 1 did not match.
            // `get` returns None instead of panicking when byte 3 is not a
            // character boundary.
            let inner = match inner.get(..3) {
                Some(tag) if tag.eq_ignore_ascii_case("sql") => inner[3..].trim_start(),
                _ => inner,
            };
            return inner.to_string();
        }
    }

    // Strategy 3: locate the statement inside free-form text. Offsets are
    // computed on `s` itself; offsets taken from `s.to_uppercase()` are not
    // valid for `s` because uppercasing can change byte lengths.
    let keywords = ["SELECT", "WITH", "INSERT", "UPDATE", "DELETE"];
    let by_priority = keywords
        .iter()
        .filter_map(|&keyword| sql_keyword_offsets(s, keyword).next())
        .next();
    let cte_start = sql_keyword_offsets(s, "WITH").find(|&at| looks_like_cte(&s[at + 4..]));
    let statement_start = match (by_priority, cte_start) {
        (Some(keyword_at), Some(cte_at)) => Some(keyword_at.min(cte_at)),
        (keyword_at, _) => keyword_at,
    };
    if let Some(pos) = statement_start {
        let sql_part = &s[pos..];
        // Take up to and including the first semicolon, or everything.
        let end = sql_part.find(';').map_or(sql_part.len(), |p| p + 1);
        return sql_part[..end].trim().to_string();
    }

    // Strategy 4: strip a leading "sql" line and return the rest.
    let mut lines = s.lines();
    match lines.next() {
        Some(first) if first.trim().eq_ignore_ascii_case("sql") => {
            lines.collect::<Vec<_>>().join("\n").trim().to_string()
        }
        _ => s.to_string(),
    }
}

/// Returns true for bytes that can be part of an ASCII identifier or word.
fn is_sql_word_byte(byte: u8) -> bool {
    byte.is_ascii_alphanumeric() || byte == b'_'
}

/// Yields the byte offset of every whole-word, ASCII-case-insensitive
/// occurrence of `keyword` in `text`, in ascending order.
///
/// `keyword` must be non-empty ASCII. Every yielded offset points at an ASCII
/// byte, so it is always a valid `str` slicing boundary.
fn sql_keyword_offsets<'a>(text: &'a str, keyword: &'a str) -> impl Iterator<Item = usize> + 'a {
    let bytes = text.as_bytes();
    let width = keyword.len();
    bytes
        .windows(width)
        .enumerate()
        .filter(move |&(at, window)| {
            window.eq_ignore_ascii_case(keyword.as_bytes())
                && (at == 0 || !is_sql_word_byte(bytes[at - 1]))
                && bytes.get(at + width).map_or(true, |&next| !is_sql_word_byte(next))
        })
        .map(|(at, _)| at)
}

/// Decides whether the text following a `WITH` keyword has the shape of a
/// common table expression: `[RECURSIVE] name [(columns)] AS (`.
///
/// This separates a real CTE from the English word "with" in an explanation.
fn looks_like_cte(after_with: &str) -> bool {
    let as_at = match sql_keyword_offsets(after_with, "AS").next() {
        Some(at) => at,
        None => return false,
    };
    // The CTE body must open right after AS.
    if !after_with[as_at + 2..].trim_start().starts_with('(') {
        return false;
    }

    let mut name = after_with[..as_at].trim();
    if let Some(first) = name.split_whitespace().next() {
        // Skip the RECURSIVE modifier unless it is itself the CTE name.
        if first.eq_ignore_ascii_case("recursive") && first.len() < name.len() {
            name = name[first.len()..].trim_start();
        }
    }

    // `name` is either `ident` or `ident(col, ...)`; prose between "with" and
    // "as" usually contains several words and fails the single-token check.
    let ident = name.split('(').next().unwrap_or("").trim_end();
    let columns_closed = !name.contains('(') || name.ends_with(')');
    !ident.is_empty() && !ident.contains(char::is_whitespace) && columns_closed
}
|
||||
|
||||
fn format_cell(val: Option<&serde_json::Value>) -> String {
|
||||
|
||||
@ -0,0 +1,65 @@
|
||||
{
|
||||
"id": "3b12ae24-17d4-4325-92bf-f3155982f3bf",
|
||||
"name": "users",
|
||||
"schema_fingerprint": "30c0e31f0963e6f4af02131bbb9ea246fbbd068b849b833565a4b28211fbc90b",
|
||||
"objects": [
|
||||
{
|
||||
"bucket": "data",
|
||||
"key": "datasets/users.parquet",
|
||||
"size_bytes": 2012,
|
||||
"created_at": "2026-03-28T01:38:59.904968123Z"
|
||||
}
|
||||
],
|
||||
"created_at": "2026-03-28T01:38:59.904968680Z",
|
||||
"updated_at": "2026-03-28T01:38:59.905221340Z",
|
||||
"description": "",
|
||||
"owner": "",
|
||||
"sensitivity": null,
|
||||
"columns": [
|
||||
{
|
||||
"name": "id",
|
||||
"data_type": "Int32",
|
||||
"sensitivity": null,
|
||||
"description": "",
|
||||
"is_pii": false
|
||||
},
|
||||
{
|
||||
"name": "username",
|
||||
"data_type": "Utf8",
|
||||
"sensitivity": null,
|
||||
"description": "",
|
||||
"is_pii": false
|
||||
},
|
||||
{
|
||||
"name": "password_hash",
|
||||
"data_type": "Utf8",
|
||||
"sensitivity": null,
|
||||
"description": "",
|
||||
"is_pii": false
|
||||
},
|
||||
{
|
||||
"name": "role",
|
||||
"data_type": "Utf8",
|
||||
"sensitivity": null,
|
||||
"description": "",
|
||||
"is_pii": false
|
||||
},
|
||||
{
|
||||
"name": "created_at",
|
||||
"data_type": "Utf8",
|
||||
"sensitivity": null,
|
||||
"description": "",
|
||||
"is_pii": false
|
||||
}
|
||||
],
|
||||
"lineage": {
|
||||
"source_system": "postgresql",
|
||||
"source_file": "127.0.0.1:5432/knowledge_base.users",
|
||||
"ingest_job": "pg-import-1774661939904",
|
||||
"ingest_timestamp": "2026-03-28T01:38:59.904968123Z",
|
||||
"parent_datasets": []
|
||||
},
|
||||
"freshness": null,
|
||||
"tags": [],
|
||||
"row_count": 1
|
||||
}
|
||||
@ -0,0 +1,93 @@
|
||||
{
|
||||
"id": "3c2579b4-f3f3-4875-95fa-58d8b49ad94c",
|
||||
"name": "meta_runs",
|
||||
"schema_fingerprint": "68f2c0d7a3ceb0aaa3c17c64900704519c72d213161bc9e5179c42ee53f6d0df",
|
||||
"objects": [
|
||||
{
|
||||
"bucket": "data",
|
||||
"key": "datasets/meta_runs.parquet",
|
||||
"size_bytes": 729773,
|
||||
"created_at": "2026-03-28T01:38:57.380576453Z"
|
||||
}
|
||||
],
|
||||
"created_at": "2026-03-28T01:38:57.380577270Z",
|
||||
"updated_at": "2026-03-28T01:38:57.380846224Z",
|
||||
"description": "",
|
||||
"owner": "",
|
||||
"sensitivity": null,
|
||||
"columns": [
|
||||
{
|
||||
"name": "id",
|
||||
"data_type": "Int32",
|
||||
"sensitivity": null,
|
||||
"description": "",
|
||||
"is_pii": false
|
||||
},
|
||||
{
|
||||
"name": "pipeline_id",
|
||||
"data_type": "Int32",
|
||||
"sensitivity": null,
|
||||
"description": "",
|
||||
"is_pii": false
|
||||
},
|
||||
{
|
||||
"name": "iteration",
|
||||
"data_type": "Int32",
|
||||
"sensitivity": null,
|
||||
"description": "",
|
||||
"is_pii": false
|
||||
},
|
||||
{
|
||||
"name": "stage_results",
|
||||
"data_type": "Utf8",
|
||||
"sensitivity": null,
|
||||
"description": "",
|
||||
"is_pii": false
|
||||
},
|
||||
{
|
||||
"name": "final_output",
|
||||
"data_type": "Utf8",
|
||||
"sensitivity": null,
|
||||
"description": "",
|
||||
"is_pii": false
|
||||
},
|
||||
{
|
||||
"name": "score",
|
||||
"data_type": "Float64",
|
||||
"sensitivity": null,
|
||||
"description": "",
|
||||
"is_pii": false
|
||||
},
|
||||
{
|
||||
"name": "model_config",
|
||||
"data_type": "Utf8",
|
||||
"sensitivity": null,
|
||||
"description": "",
|
||||
"is_pii": false
|
||||
},
|
||||
{
|
||||
"name": "duration_ms",
|
||||
"data_type": "Int32",
|
||||
"sensitivity": null,
|
||||
"description": "",
|
||||
"is_pii": false
|
||||
},
|
||||
{
|
||||
"name": "created_at",
|
||||
"data_type": "Utf8",
|
||||
"sensitivity": null,
|
||||
"description": "",
|
||||
"is_pii": false
|
||||
}
|
||||
],
|
||||
"lineage": {
|
||||
"source_system": "postgresql",
|
||||
"source_file": "127.0.0.1:5432/knowledge_base.meta_runs",
|
||||
"ingest_job": "pg-import-1774661937380",
|
||||
"ingest_timestamp": "2026-03-28T01:38:57.380576453Z",
|
||||
"parent_datasets": []
|
||||
},
|
||||
"freshness": null,
|
||||
"tags": [],
|
||||
"row_count": 38
|
||||
}
|
||||
BIN
data/datasets/meta_runs.parquet
Normal file
BIN
data/datasets/meta_runs.parquet
Normal file
Binary file not shown.
BIN
data/datasets/users.parquet
Normal file
BIN
data/datasets/users.parquet
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user