Robust SQL extraction: handles explanations, markdown, prefixes

clean_sql now uses 3 strategies in priority order: 1. Extract from ```sql...``` markdown blocks 2. Find first SELECT/WITH/INSERT/UPDATE/DELETE statement in text 3. Strip leading "sql" keyword fallback Tested against 5 real model output patterns: - Clean SQL ✓ - "sql" prefixed ✓ - Markdown fenced ✓ - Explanation before ```sql block ✓ - Explanation with SELECT buried in text ✓ Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-27 20:42:11 -05:00 · 2026-03-27 20:42:11 -05:00 · 0bd753294b
commit 0bd753294b
parent 34c03894ae
5 changed files with 195 additions and 13 deletions
--- a/crates/ui/src/main.rs
+++ b/crates/ui/src/main.rs
@ -1025,23 +1025,47 @@ fn ResultsTable(response: QueryResponse) -> Element {
    }
 }

-/// Clean AI-generated SQL: strip markdown fences, leading "sql" keyword, explanations.
+/// Clean AI-generated SQL: extract only the SQL query, strip everything else.
 ///
 /// Model output is free text, so extraction is best-effort. Three strategies
 /// are tried in priority order and the first one that applies wins:
 ///
 /// 1. A fenced markdown block. A block tagged `sql` (any letter case) anywhere
 ///    in the text is preferred; otherwise the first fenced block is used,
 ///    minus a leading `sql` label.
 /// 2. The earliest SQL statement keyword (`SELECT`, `WITH`, `INSERT`,
 ///    `UPDATE`, `DELETE`) found as a whole word, up to and including the
 ///    first semicolon that is not inside a quoted literal (or to the end of
 ///    the text when there is no semicolon).
 /// 3. The trimmed text itself, minus a leading line that is only `sql`.
 ///
 /// Returns the extracted SQL as an owned, trimmed `String`.
 fn clean_sql(raw: &str) -> String {
     let s = raw.trim();

     // Strategy 1: fenced markdown block.
     if let Some(block) = extract_fenced_sql(s) {
         return block.to_string();
     }

     // Strategy 2: earliest statement keyword, cut at the statement's end.
     if let Some(start) = find_statement_start(s) {
         let statement = &s[start..];
         return statement[..statement_end(statement)].trim().to_string();
     }

     // Strategy 3: no recognizable statement; only drop a bare "sql" first line.
     let mut lines = s.lines();
     match lines.next() {
         Some(first) if first.trim().eq_ignore_ascii_case("sql") => {
             lines.collect::<Vec<_>>().join("\n").trim().to_string()
         }
         _ => s.to_string(),
     }
 }

 /// Returns the trimmed contents of a fenced markdown block in `s`, if any.
 ///
 /// A block whose fence is tagged `sql` (case-insensitive) takes priority over
 /// an earlier untagged block. A fence without a closing counterpart is ignored.
 fn extract_fenced_sql(s: &str) -> Option<&str> {
     const FENCE: &str = "```";
     const SQL_FENCE: &str = "```sql";

     // ASCII-only case folding keeps every byte offset identical to `s`, so
     // positions found in `lower` are valid slice indices into `s`.
     let lower = s.to_ascii_lowercase();
     if let Some(start) = lower.find(SQL_FENCE) {
         let body = &s[start + SQL_FENCE.len()..];
         if let Some(end) = body.find(FENCE) {
             return Some(body[..end].trim());
         }
     }

     let start = s.find(FENCE)?;
     let body = &s[start + FENCE.len()..];
     let end = body.find(FENCE)?;
     Some(strip_sql_label(body[..end].trim()))
 }

 /// Drops a leading `sql` label (any letter case) and the whitespace after it.
 fn strip_sql_label(inner: &str) -> &str {
     // `get` returns `None` when the text is shorter than three bytes or byte 3
     // is not a char boundary, so this cannot panic on non-ASCII input.
     match inner.get(..3) {
         Some(label) if label.eq_ignore_ascii_case("sql") => inner[3..].trim_start(),
         _ => inner,
     }
 }

 /// Finds the byte offset in `s` where the SQL statement most likely begins.
 ///
 /// The earliest whole-word keyword that is followed by plausible statement
 /// syntax wins, so `WITH c AS (SELECT ...)` starts at `WITH` rather than at
 /// the inner `SELECT`, while prose such as "with the results" is skipped.
 /// If no keyword passes that check, the earliest whole-word keyword is used.
 fn find_statement_start(s: &str) -> Option<usize> {
     const KEYWORDS: [&str; 5] = ["SELECT", "WITH", "INSERT", "UPDATE", "DELETE"];

     // `to_ascii_uppercase` (unlike `to_uppercase`) never changes byte lengths,
     // so offsets into `upper` are valid offsets into `s`.
     let upper = s.to_ascii_uppercase();
     let bytes = upper.as_bytes();
     let mut validated: Option<usize> = None;
     let mut fallback: Option<usize> = None;

     for &keyword in &KEYWORDS {
         for (pos, _) in upper.match_indices(keyword) {
             let after = pos + keyword.len();
             let starts_word = pos == 0 || !is_ident_byte(bytes[pos - 1]);
             let ends_word = after >= bytes.len() || !is_ident_byte(bytes[after]);
             if !(starts_word && ends_word) {
                 continue; // part of a longer word such as "selected"
             }
             if fallback.map_or(true, |f| pos < f) {
                 fallback = Some(pos);
             }
             if looks_like_statement(keyword, &upper[after..]) {
                 if validated.map_or(true, |v| pos < v) {
                     validated = Some(pos);
                 }
                 // Later matches of this keyword can only be further right.
                 break;
             }
         }
     }
     validated.or(fallback)
 }

 /// Cheap plausibility check on the (upper-cased) text following a keyword.
 ///
 /// `SELECT` is always accepted. The other keywords are also common English
 /// words, so they must be followed by the token SQL requires: `WITH [name] AS`
 /// or `WITH RECURSIVE`, `INSERT INTO`, `UPDATE [table] SET`, `DELETE FROM`.
 fn looks_like_statement(keyword: &str, rest_upper: &str) -> bool {
     let mut tokens = rest_upper.split_whitespace();
     let first = tokens.next().unwrap_or("");
     let second = tokens.next().unwrap_or("");
     match keyword {
         "SELECT" => true,
         "WITH" => token_is(first, "RECURSIVE") || token_is(second, "AS"),
         "INSERT" => token_is(first, "INTO"),
         "UPDATE" => token_is(second, "SET"),
         "DELETE" => token_is(first, "FROM"),
         _ => false,
     }
 }

 /// True when `token` is `word`, optionally followed by punctuation (`AS(`).
 fn token_is(token: &str, word: &str) -> bool {
     token
         .strip_prefix(word)
         .map_or(false, |rest| rest.bytes().next().map_or(true, |b| !is_ident_byte(b)))
 }

 /// True for bytes that can be part of an unquoted SQL identifier or keyword.
 fn is_ident_byte(b: u8) -> bool {
     b.is_ascii_alphanumeric() || b == b'_'
 }

 /// Returns the exclusive end offset of the first statement in `sql`.
 ///
 /// That is just past the first semicolon outside a `'...'` or `"..."` literal,
 /// or `sql.len()` when there is none. If a quote is left open the text is not
 /// well-formed, so the first semicolon of any kind is used instead.
 fn statement_end(sql: &str) -> usize {
     let mut quote: Option<char> = None;
     for (i, c) in sql.char_indices() {
         match quote {
             // A doubled quote ('') closes and immediately reopens the literal.
             Some(q) => {
                 if c == q {
                     quote = None;
                 }
             }
             None => match c {
                 '\'' | '"' => quote = Some(c),
                 ';' => return i + 1,
                 _ => {}
             },
         }
     }
     match quote {
         Some(_) => sql.find(';').map_or(sql.len(), |p| p + 1),
         None => sql.len(),
     }
 }

 fn format_cell(val: Option<&serde_json::Value>) -> String {
--- a/data/_catalog/manifests/3b12ae24-17d4-4325-92bf-f3155982f3bf.json
+++ b/data/_catalog/manifests/3b12ae24-17d4-4325-92bf-f3155982f3bf.json
@ -0,0 +1,65 @@
+{
+  "id": "3b12ae24-17d4-4325-92bf-f3155982f3bf",
+  "name": "users",
+  "schema_fingerprint": "30c0e31f0963e6f4af02131bbb9ea246fbbd068b849b833565a4b28211fbc90b",
+  "objects": [
+    {
+      "bucket": "data",
+      "key": "datasets/users.parquet",
+      "size_bytes": 2012,
+      "created_at": "2026-03-28T01:38:59.904968123Z"
+    }
+  ],
+  "created_at": "2026-03-28T01:38:59.904968680Z",
+  "updated_at": "2026-03-28T01:38:59.905221340Z",
+  "description": "",
+  "owner": "",
+  "sensitivity": null,
+  "columns": [
+    {
+      "name": "id",
+      "data_type": "Int32",
+      "sensitivity": null,
+      "description": "",
+      "is_pii": false
+    },
+    {
+      "name": "username",
+      "data_type": "Utf8",
+      "sensitivity": null,
+      "description": "",
+      "is_pii": false
+    },
+    {
+      "name": "password_hash",
+      "data_type": "Utf8",
+      "sensitivity": null,
+      "description": "",
+      "is_pii": false
+    },
+    {
+      "name": "role",
+      "data_type": "Utf8",
+      "sensitivity": null,
+      "description": "",
+      "is_pii": false
+    },
+    {
+      "name": "created_at",
+      "data_type": "Utf8",
+      "sensitivity": null,
+      "description": "",
+      "is_pii": false
+    }
+  ],
+  "lineage": {
+    "source_system": "postgresql",
+    "source_file": "127.0.0.1:5432/knowledge_base.users",
+    "ingest_job": "pg-import-1774661939904",
+    "ingest_timestamp": "2026-03-28T01:38:59.904968123Z",
+    "parent_datasets": []
+  },
+  "freshness": null,
+  "tags": [],
+  "row_count": 1
+}
--- a/data/_catalog/manifests/3c2579b4-f3f3-4875-95fa-58d8b49ad94c.json
+++ b/data/_catalog/manifests/3c2579b4-f3f3-4875-95fa-58d8b49ad94c.json
@ -0,0 +1,93 @@
+{
+  "id": "3c2579b4-f3f3-4875-95fa-58d8b49ad94c",
+  "name": "meta_runs",
+  "schema_fingerprint": "68f2c0d7a3ceb0aaa3c17c64900704519c72d213161bc9e5179c42ee53f6d0df",
+  "objects": [
+    {
+      "bucket": "data",
+      "key": "datasets/meta_runs.parquet",
+      "size_bytes": 729773,
+      "created_at": "2026-03-28T01:38:57.380576453Z"
+    }
+  ],
+  "created_at": "2026-03-28T01:38:57.380577270Z",
+  "updated_at": "2026-03-28T01:38:57.380846224Z",
+  "description": "",
+  "owner": "",
+  "sensitivity": null,
+  "columns": [
+    {
+      "name": "id",
+      "data_type": "Int32",
+      "sensitivity": null,
+      "description": "",
+      "is_pii": false
+    },
+    {
+      "name": "pipeline_id",
+      "data_type": "Int32",
+      "sensitivity": null,
+      "description": "",
+      "is_pii": false
+    },
+    {
+      "name": "iteration",
+      "data_type": "Int32",
+      "sensitivity": null,
+      "description": "",
+      "is_pii": false
+    },
+    {
+      "name": "stage_results",
+      "data_type": "Utf8",
+      "sensitivity": null,
+      "description": "",
+      "is_pii": false
+    },
+    {
+      "name": "final_output",
+      "data_type": "Utf8",
+      "sensitivity": null,
+      "description": "",
+      "is_pii": false
+    },
+    {
+      "name": "score",
+      "data_type": "Float64",
+      "sensitivity": null,
+      "description": "",
+      "is_pii": false
+    },
+    {
+      "name": "model_config",
+      "data_type": "Utf8",
+      "sensitivity": null,
+      "description": "",
+      "is_pii": false
+    },
+    {
+      "name": "duration_ms",
+      "data_type": "Int32",
+      "sensitivity": null,
+      "description": "",
+      "is_pii": false
+    },
+    {
+      "name": "created_at",
+      "data_type": "Utf8",
+      "sensitivity": null,
+      "description": "",
+      "is_pii": false
+    }
+  ],
+  "lineage": {
+    "source_system": "postgresql",
+    "source_file": "127.0.0.1:5432/knowledge_base.meta_runs",
+    "ingest_job": "pg-import-1774661937380",
+    "ingest_timestamp": "2026-03-28T01:38:57.380576453Z",
+    "parent_datasets": []
+  },
+  "freshness": null,
+  "tags": [],
+  "row_count": 38
+}
--- a/data/datasets/meta_runs.parquet
+++ b/data/datasets/meta_runs.parquet
--- a/data/datasets/users.parquet
+++ b/data/datasets/users.parquet