golangLAKEHOUSE/internal/chat/ollama_cloud.go

package chat

import (
	"bytes"
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"net/http"
	"strings"
	"time"
)

// OllamaCloud — Ollama Cloud (Pro plan as of 2026-04-28). Bearer
// auth via OLLAMA_CLOUD_KEY. Wire format mirrors the Rust adapter
// (crates/gateway/src/v1/ollama_cloud.rs) — system/prompt split
// against /api/generate with cloud-friendly num_predict floor.
//
// Model routing: "ollama_cloud/<model>" (prefix) and "<model>:cloud"
// (suffix) both route here. Registry.Resolve handles the suffix case;
// the prefix case strips "ollama_cloud/" before the upstream call so
// ollama.com sees the bare model name. Suffix-form models pass through
// StripPrefix unchanged (no leading prefix to strip — exactly what the
// upstream wants since "kimi-k2.6:cloud" is the canonical name on
// ollama.com).
type OllamaCloud struct {
	apiKey     string
	baseURL    string
	httpClient *http.Client
}

func NewOllamaCloud(apiKey string, timeout time.Duration) *OllamaCloud {
	if timeout == 0 {
		timeout = 180 * time.Second
	}
	return &OllamaCloud{
		apiKey:     apiKey,
		baseURL:    "https://ollama.com",
		httpClient: &http.Client{Timeout: timeout},
	}
}

func (o *OllamaCloud) Name() string    { return "ollama_cloud" }
func (o *OllamaCloud) Available() bool { return o.apiKey != "" }

func (o *OllamaCloud) Chat(ctx context.Context, req Request) (*Response, error) {
	// Strip "ollama_cloud/" prefix (Phase 4 scrum fix B-3 — Opus BLOCK).
	// Pre-fix used "cloud" which never matched, so upstream got the
	// prefixed model name and 400'd. ":cloud" suffix models pass
	// through unchanged — that's the canonical name on ollama.com.
	model := StripPrefix(req.Model, "ollama_cloud")

	system, prompt := flattenMessages(req.Messages)

	body := map[string]any{
		"model":  model,
		"prompt": prompt,
		"stream": false,
		// Mirror local Ollama — skip reasoning by default. Callers that
		// need thinking explicitly (e.g. for Kimi K2 long reasoning)
		// will get a future Request.Think field; the v0 default keeps
		// outputs short and predictable.
		"think": false,
		"options": map[string]any{
			// Cloud reasoning models need headroom — 400 floor matches
			// the Rust adapter's policy for kimi-k2.6 / gpt-oss:120b.
			"num_predict": maxInt(req.MaxTokens, 400),
			// Cloud-tier default 0.3 when caller didn't set; honor an
			// explicit value otherwise. Anthropic 4.7 deprecation
			// doesn't apply here (Ollama Cloud is its own server),
			// so always send.
			"temperature": tempPtrOr(req.Temperature, 0.3),
		},
	}
	if system != "" {
		body["system"] = system
	}
	if req.Format == "json" {
		body["format"] = "json"
	}

	bs, _ := json.Marshal(body)
	httpReq, err := http.NewRequestWithContext(ctx, "POST", o.baseURL+"/api/generate", bytes.NewReader(bs))
	if err != nil {
		return nil, err
	}
	httpReq.Header.Set("Content-Type", "application/json")
	httpReq.Header.Set("Authorization", "Bearer "+o.apiKey)

	resp, err := o.httpClient.Do(httpReq)
	if err != nil {
		if errors.Is(ctx.Err(), context.DeadlineExceeded) {
			return nil, fmt.Errorf("%w: ollama_cloud", ErrTimeout)
		}
		return nil, fmt.Errorf("ollama_cloud: %w", err)
	}
	defer resp.Body.Close()

	rb, _ := io.ReadAll(resp.Body)
	if resp.StatusCode/100 != 2 {
		return nil, fmt.Errorf("%w: ollama_cloud %d: %s", ErrUpstream, resp.StatusCode, abbrev(string(rb), 300))
	}

	var cloudResp struct {
		Model           string `json:"model"`
		Response        string `json:"response"`
		Done            bool   `json:"done"`
		DoneReason      string `json:"done_reason"`
		PromptEvalCount int    `json:"prompt_eval_count"`
		EvalCount       int    `json:"eval_count"`
	}
	if err := json.Unmarshal(rb, &cloudResp); err != nil {
		return nil, fmt.Errorf("ollama_cloud decode: %w (body=%s)", err, abbrev(string(rb), 200))
	}

	return &Response{
		Model:        model,
		Content:      cloudResp.Response,
		InputTokens:  cloudResp.PromptEvalCount,
		OutputTokens: cloudResp.EvalCount,
		FinishReason: finishReasonFromOllama(cloudResp.Done, cloudResp.DoneReason),
	}, nil
}

// flattenMessages splits a Message list into Ollama Cloud's
// /api/generate shape (single system + concatenated prompt).
// Mirrors the Rust adapter's flatten_messages_public.
func flattenMessages(messages []Message) (system, prompt string) {
	var sysParts []string
	var promptParts []string
	for _, m := range messages {
		switch m.Role {
		case "system":
			sysParts = append(sysParts, m.Content)
		case "assistant":
			promptParts = append(promptParts, "Assistant: "+m.Content)
		default: // "user" or anything else
			promptParts = append(promptParts, "User: "+m.Content)
		}
	}
	return strings.Join(sysParts, "\n\n"), strings.Join(promptParts, "\n\n")
}

func maxInt(a, b int) int {
	if a > b {
		return a
	}
	return b
}

func defaultFloat(v, fallback float64) float64 {
	if v == 0 {
		return fallback
	}
	return v
}

func tempPtrOr(v *float64, fallback float64) float64 {
	if v == nil {
		return fallback
	}
	return *v
}