// mcpd is the Model Context Protocol server that exposes Lakehouse // capabilities as MCP tools. Replaces the Bun mcp-server.ts surface // (the MCP-tool-only subset; HTTP demo routes stay Bun until G5). // // Tools exposed: // list_datasets GET /v1/catalog/list // get_manifest GET /v1/catalog/manifest/ // query_sql POST /v1/sql // embed_text POST /v1/embed // search_vectors POST /v1/vectors/index//search // // Transport: StdioTransport (the universal MCP transport — Claude // Desktop, Claude Code, MCP CLI all speak this). Other transports // (SSE, HTTP) can be added later by changing the Run call. // // Setup for Claude Desktop / Claude Code: // bin/mcpd --gateway http://127.0.0.1:3110 // (configure your client to spawn this binary as an MCP server) // // Why not in cmd/gateway: separation of concerns. Gateway is HTTP // for direct-API callers; mcpd is stdio for MCP consumers. Keeping // them separate means each can be deployed / restarted / monitored // without affecting the other. package main import ( "bytes" "context" "encoding/json" "flag" "fmt" "io" "log" "net/http" "net/url" "strings" "time" "github.com/modelcontextprotocol/go-sdk/mcp" ) func main() { gatewayURL := flag.String("gateway", "http://127.0.0.1:3110", "Gateway URL (where mcpd proxies all tool calls)") flag.Parse() srv := buildServer(*gatewayURL) if err := srv.Run(context.Background(), &mcp.StdioTransport{}); err != nil { log.Fatalf("mcpd: %v", err) } } // buildServer assembles the MCP server with all tools wired against // the given gateway URL. Extracted from main() so tests can build // the server with a test gateway URL. func buildServer(gatewayURL string) *mcp.Server { srv := mcp.NewServer(&mcp.Implementation{ Name: "lakehouse", Version: "v0.1.0", }, nil) gw := newGatewayClient(gatewayURL) mcp.AddTool(srv, &mcp.Tool{ Name: "list_datasets", Description: "List all datasets registered in the catalog. " + "Returns dataset_id, name, schema_fingerprint, row_count " + "per dataset.", }, gw.listDatasets) mcp.AddTool(srv, &mcp.Tool{ Name: "get_manifest", Description: "Fetch the manifest for a single dataset by name. " + "Includes schema fingerprint, parquet object keys, row count, " + "created_at_unix_ns.", }, gw.getManifest) mcp.AddTool(srv, &mcp.Tool{ Name: "query_sql", Description: "Execute a SQL query against the registered datasets. " + "Returns columns + rows. SQL is interpreted by DuckDB; standard " + "SQL plus DuckDB-specific functions (read_parquet, etc.) work.", }, gw.querySQL) mcp.AddTool(srv, &mcp.Tool{ Name: "embed_text", Description: "Embed one or more texts via the configured embedding " + "model (default: nomic-embed-text). Returns one vector per text " + "in the same order as the input.", }, gw.embedText) mcp.AddTool(srv, &mcp.Tool{ Name: "search_vectors", Description: "Find the top-K nearest neighbors of a query vector " + "in the named HNSW index. K defaults to 10 if omitted.", }, gw.searchVectors) return srv } // gatewayClient holds the HTTP client + base URL for proxying tool // calls to the Go gateway. Per-tool latency is on the order of a // gateway round-trip; the 30s timeout accommodates the slowest // expected SQL query without holding stdio sessions indefinitely. type gatewayClient struct { baseURL string hc *http.Client } func newGatewayClient(baseURL string) *gatewayClient { return &gatewayClient{ baseURL: strings.TrimRight(baseURL, "/"), hc: &http.Client{Timeout: 30 * time.Second}, } } // ── tool argument structs (jsonschema tags drive schema generation) ── type listDatasetsArgs struct{} type getManifestArgs struct { Name string `json:"name" jsonschema:"the dataset name to fetch"` } type querySQLArgs struct { SQL string `json:"sql" jsonschema:"the SQL query to execute"` } type embedTextArgs struct { Texts []string `json:"texts" jsonschema:"the texts to embed"` Model string `json:"model,omitempty" jsonschema:"optional model name (defaults to embedd's configured default)"` } type searchVectorsArgs struct { IndexName string `json:"index_name" jsonschema:"the index to search"` Vector []float32 `json:"vector" jsonschema:"the query vector"` K int `json:"k,omitempty" jsonschema:"top-K to return (default 10)"` } // ── tool handlers ── func (g *gatewayClient) listDatasets(ctx context.Context, _ *mcp.CallToolRequest, _ listDatasetsArgs) (*mcp.CallToolResult, any, error) { body, err := g.proxy(ctx, http.MethodGet, "/v1/catalog/list", nil) if err != nil { return errorResult(err), nil, nil } return jsonResult(body), nil, nil } func (g *gatewayClient) getManifest(ctx context.Context, _ *mcp.CallToolRequest, args getManifestArgs) (*mcp.CallToolResult, any, error) { if args.Name == "" { return errorResult(fmt.Errorf("name is required")), nil, nil } path := "/v1/catalog/manifest/" + url.PathEscape(args.Name) body, err := g.proxy(ctx, http.MethodGet, path, nil) if err != nil { return errorResult(err), nil, nil } return jsonResult(body), nil, nil } func (g *gatewayClient) querySQL(ctx context.Context, _ *mcp.CallToolRequest, args querySQLArgs) (*mcp.CallToolResult, any, error) { if strings.TrimSpace(args.SQL) == "" { return errorResult(fmt.Errorf("sql is required")), nil, nil } reqBody, _ := json.Marshal(map[string]string{"sql": args.SQL}) body, err := g.proxy(ctx, http.MethodPost, "/v1/sql", reqBody) if err != nil { return errorResult(err), nil, nil } return jsonResult(body), nil, nil } func (g *gatewayClient) embedText(ctx context.Context, _ *mcp.CallToolRequest, args embedTextArgs) (*mcp.CallToolResult, any, error) { if len(args.Texts) == 0 { return errorResult(fmt.Errorf("texts is required")), nil, nil } payload := map[string]any{"texts": args.Texts} if args.Model != "" { payload["model"] = args.Model } reqBody, _ := json.Marshal(payload) body, err := g.proxy(ctx, http.MethodPost, "/v1/embed", reqBody) if err != nil { return errorResult(err), nil, nil } return jsonResult(body), nil, nil } func (g *gatewayClient) searchVectors(ctx context.Context, _ *mcp.CallToolRequest, args searchVectorsArgs) (*mcp.CallToolResult, any, error) { if args.IndexName == "" { return errorResult(fmt.Errorf("index_name is required")), nil, nil } if len(args.Vector) == 0 { return errorResult(fmt.Errorf("vector is required")), nil, nil } payload := map[string]any{"vector": args.Vector} if args.K > 0 { payload["k"] = args.K } reqBody, _ := json.Marshal(payload) path := "/v1/vectors/index/" + url.PathEscape(args.IndexName) + "/search" body, err := g.proxy(ctx, http.MethodPost, path, reqBody) if err != nil { return errorResult(err), nil, nil } return jsonResult(body), nil, nil } // proxy makes a request to the gateway and returns the response body // on success. Non-2xx status codes return an error with the body // preview in the message — surfaced to the MCP client as a tool error // rather than a transport-level failure. func (g *gatewayClient) proxy(ctx context.Context, method, path string, body []byte) ([]byte, error) { var rdr io.Reader if body != nil { rdr = bytes.NewReader(body) } req, err := http.NewRequestWithContext(ctx, method, g.baseURL+path, rdr) if err != nil { return nil, fmt.Errorf("build request: %w", err) } if body != nil { req.Header.Set("Content-Type", "application/json") } resp, err := g.hc.Do(req) if err != nil { return nil, fmt.Errorf("call gateway: %w", err) } defer resp.Body.Close() respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 16<<20)) // 16 MiB tool-response cap if resp.StatusCode < 200 || resp.StatusCode >= 300 { preview := respBody if len(preview) > 512 { preview = preview[:512] } return nil, fmt.Errorf("gateway %s %s: status %d: %s", method, path, resp.StatusCode, string(preview)) } return respBody, nil } // errorResult wraps an error as an MCP tool error result. The MCP // protocol distinguishes "tool ran but reported failure" (returned // in CallToolResult.IsError + content) from "tool threw" (returned // as the third return value). We use the former so the LLM caller // sees the error text and can decide how to react, rather than // surfacing the error as transport noise. func errorResult(err error) *mcp.CallToolResult { return &mcp.CallToolResult{ IsError: true, Content: []mcp.Content{ &mcp.TextContent{Text: err.Error()}, }, } } // jsonResult wraps a JSON byte slice as a successful tool result. // The content is text — MCP clients render it; LLMs parse it as // JSON when their tool config indicates so. func jsonResult(body []byte) *mcp.CallToolResult { return &mcp.CallToolResult{ Content: []mcp.Content{ &mcp.TextContent{Text: string(body)}, }, } }