golangLAKEHOUSE/internal/chat/ollama_cloud.go
root 05273ac06b phase 4: chatd — multi-provider LLM dispatcher (ollama / cloud / openrouter / opencode / kimi)
new cmd/chatd on :3220 routes /v1/chat to the right provider based
on model-name prefix or :cloud suffix. closes the architectural gap
named in lakehouse.toml [models]: tiers map to model IDs, but until
phase 4 there was no service that could actually CALL those models
from go.

routing rules (registry.Resolve):
  ollama/<m>          → local Ollama (prefix stripped)
  ollama_cloud/<m>    → Ollama Cloud
  <m>:cloud           → Ollama Cloud (suffix variant — kimi-k2.6:cloud)
  openrouter/<v>/<m>  → OpenRouter (prefix stripped, OpenAI-compat)
  opencode/<m>        → OpenCode unified Zen+Go
  kimi/<m>            → Kimi For Coding (api.kimi.com/coding/v1)
  bare names          → local Ollama (default)

provider implementations:
- internal/chat/types.go      Provider interface, Request/Response, errors
- internal/chat/registry.go   prefix + :cloud suffix dispatch
- internal/chat/ollama.go     local Ollama via /api/chat (think=false default)
- internal/chat/ollama_cloud.go  Ollama Cloud via /api/generate (Bearer auth)
- internal/chat/openai_compat.go shared OpenAI Chat Completions for the
                                 OpenRouter/OpenCode/Kimi family
- internal/chat/builder.go    BuildRegistry from BuilderInput;
                              ResolveKey reads env then .env file fallback

config:
- ChatdConfig in internal/shared/config.go with bind, ollama_url,
  per-provider key env names + .env fallback paths, timeout
- Gateway gains chatd_url + /v1/chat + /v1/chat/* routes
- lakehouse.toml [chatd] block with /etc/lakehouse/<provider>.env defaults

tests (19 in internal/chat):
- registry: prefix + :cloud + errors + telemetry + provider listing
- ollama: happy path + prefix strip + format=json + 500 mapping +
  flatten_messages
- openai_compat: happy path + format=json + 429 mapping + zero-choices

think=false default in ollama + ollama_cloud — local hot path skips
reasoning, low-budget callers (the playbook_lift judge at max_tokens=10)
get direct answers instead of empty content + done_reason=length.
proven via chatd_smoke acceptance.

acceptance gate: scripts/chatd_smoke.sh — 6/6 PASS:
1. /v1/chat/providers lists exactly registered providers (1 in dev mode)
2. bare model → ollama default with content + token counts + latency
3. explicit ollama/<m> → prefix stripped at upstream
4. <m>:cloud without ollama_cloud registered → 404 (no silent fall-through)
5. unknown/<m> → falls through to default → upstream 502 (no prefix rewrite)
6. missing model field → 400

just verify: PASS (vet + 30 packages × short tests + 9 smokes).
chatd_smoke is a domain smoke (not in just verify, mirrors matrix /
observer / pathway pattern).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-30 00:08:29 -05:00

149 lines
4.2 KiB
Go

package chat
import (
	"bytes"
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"strings"
	"time"
)
// OllamaCloud — Ollama Cloud (Pro plan as of 2026-04-28). Bearer
// auth via OLLAMA_CLOUD_KEY. Wire format mirrors the Rust adapter
// (crates/gateway/src/v1/ollama_cloud.rs) — system/prompt split
// against /api/generate with cloud-friendly num_predict floor.
//
// Model routing: both "cloud/<model>" (prefix) and "<model>:cloud"
// (suffix) route here. The registry handles the suffix case; this
// provider only sees the prefix form. Suffix models pass through
// the StripPrefix call unchanged (no leading "cloud/" to strip),
// which is what the upstream wants.
//
// NOTE(review): the file header documents the prefix as
// "ollama_cloud/<m>" while Chat strips "cloud" — confirm against
// registry.Resolve which spelling actually reaches this provider.
type OllamaCloud struct {
	apiKey     string       // bearer token; empty means the provider is unavailable
	baseURL    string       // "https://ollama.com" in production; overridable in tests
	httpClient *http.Client // shared client carrying the request timeout
}
// NewOllamaCloud constructs a provider that talks to ollama.com with
// the given bearer key. A zero timeout falls back to 180s; any other
// value (including negative) is passed straight to the http.Client.
func NewOllamaCloud(apiKey string, timeout time.Duration) *OllamaCloud {
	effective := timeout
	if effective == 0 {
		effective = 180 * time.Second
	}
	p := &OllamaCloud{
		apiKey:  apiKey,
		baseURL: "https://ollama.com",
	}
	p.httpClient = &http.Client{Timeout: effective}
	return p
}
// Name returns the registry identifier for this provider.
func (o *OllamaCloud) Name() string {
	return "ollama_cloud"
}
// Available reports whether an API key was configured; without one
// the provider cannot authenticate and should not be dispatched to.
func (o *OllamaCloud) Available() bool {
	return len(o.apiKey) > 0
}
// Chat sends one non-streaming completion to Ollama Cloud's
// /api/generate endpoint and maps the reply into the provider-neutral
// Response. The message list is flattened into a single system string
// plus a role-labelled prompt (see flattenMessages) because
// /api/generate, unlike /api/chat, takes no message array.
//
// Timeouts — whether from the request context's deadline or from the
// http.Client's own Timeout — are reported as ErrTimeout; non-2xx
// upstream replies as ErrUpstream.
func (o *OllamaCloud) Chat(ctx context.Context, req Request) (*Response, error) {
	model := StripPrefix(req.Model, "cloud")
	// "<model>:cloud" suffix passes through unchanged — that's the
	// canonical name on ollama.com.
	system, prompt := flattenMessages(req.Messages)
	body := map[string]any{
		"model":  model,
		"prompt": prompt,
		"stream": false,
		// Mirror local Ollama — skip reasoning by default. Callers that
		// need thinking explicitly (e.g. for Kimi K2 long reasoning)
		// will get a future Request.Think field; the v0 default keeps
		// outputs short and predictable.
		"think": false,
		"options": map[string]any{
			// Cloud reasoning models need headroom — 400 floor matches
			// the Rust adapter's policy for kimi-k2.6 / gpt-oss:120b.
			"num_predict": maxInt(req.MaxTokens, 400),
			"temperature": defaultFloat(req.Temperature, 0.3),
		},
	}
	if system != "" {
		body["system"] = system
	}
	if req.Format == "json" {
		body["format"] = "json"
	}
	// Marshal cannot fail: the map holds only strings, bools, ints,
	// and floats.
	bs, _ := json.Marshal(body)
	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, o.baseURL+"/api/generate", bytes.NewReader(bs))
	if err != nil {
		return nil, err
	}
	httpReq.Header.Set("Content-Type", "application/json")
	httpReq.Header.Set("Authorization", "Bearer "+o.apiKey)
	resp, err := o.httpClient.Do(httpReq)
	if err != nil {
		// Two distinct timeout shapes: a context deadline shows up on
		// ctx.Err(), while the client's own Timeout surfaces as a
		// *url.Error whose Timeout() reports true. Map both to
		// ErrTimeout so callers see one sentinel.
		var uerr *url.Error
		if errors.Is(ctx.Err(), context.DeadlineExceeded) || (errors.As(err, &uerr) && uerr.Timeout()) {
			return nil, fmt.Errorf("%w: ollama_cloud", ErrTimeout)
		}
		return nil, fmt.Errorf("ollama_cloud: %w", err)
	}
	defer resp.Body.Close()
	rb, err := io.ReadAll(resp.Body)
	if err != nil {
		// A truncated body would otherwise surface as a confusing
		// JSON decode error downstream.
		return nil, fmt.Errorf("ollama_cloud read body: %w", err)
	}
	if resp.StatusCode/100 != 2 {
		return nil, fmt.Errorf("%w: ollama_cloud %d: %s", ErrUpstream, resp.StatusCode, abbrev(string(rb), 300))
	}
	var cloudResp struct {
		Model           string `json:"model"`
		Response        string `json:"response"`
		Done            bool   `json:"done"`
		PromptEvalCount int    `json:"prompt_eval_count"`
		EvalCount       int    `json:"eval_count"`
	}
	if err := json.Unmarshal(rb, &cloudResp); err != nil {
		return nil, fmt.Errorf("ollama_cloud decode: %w (body=%s)", err, abbrev(string(rb), 200))
	}
	return &Response{
		Model:        model,
		Content:      cloudResp.Response,
		InputTokens:  cloudResp.PromptEvalCount,
		OutputTokens: cloudResp.EvalCount,
		FinishReason: finishReasonFromDone(cloudResp.Done),
	}, nil
}
// flattenMessages splits a Message list into Ollama Cloud's
// /api/generate shape: all system messages joined into one system
// string, and the remaining turns concatenated into a single prompt
// with "User: "/"Assistant: " role labels. Both halves are joined
// with blank lines. Mirrors the Rust adapter's flatten_messages_public.
func flattenMessages(messages []Message) (system, prompt string) {
	var sysChunks []string
	var turnChunks []string
	for _, msg := range messages {
		if msg.Role == "system" {
			sysChunks = append(sysChunks, msg.Content)
			continue
		}
		// Anything that is not system or assistant is labelled as the
		// user — same fallback as the original switch default.
		label := "User: "
		if msg.Role == "assistant" {
			label = "Assistant: "
		}
		turnChunks = append(turnChunks, label+msg.Content)
	}
	system = strings.Join(sysChunks, "\n\n")
	prompt = strings.Join(turnChunks, "\n\n")
	return system, prompt
}
// maxInt returns the larger of a and b.
func maxInt(a, b int) int {
	if b > a {
		return b
	}
	return a
}
// defaultFloat substitutes fallback only when v is exactly zero; any
// non-zero v — including negative values — is returned unchanged.
func defaultFloat(v, fallback float64) float64 {
	if v != 0 {
		return v
	}
	return fallback
}