Two threads landing together — the doc edits interleave so they ship in a single commit. 1. **vectord substrate fix verified at original scale** (closes the 2026-05-01 thread). Re-ran multitier 5min @ conc=50: 132,211 scenarios at 438/sec, 6/6 classes at 0% failure (was 4/6 pre-fix). Throughput dropped 1,115 → 438/sec because previously-broken scenarios now do real HNSW Add work — honest cost of correctness. The fix (i.vectors side-store + safeGraphAdd recover wrappers + smallIndexRebuildThreshold=32 + saveTask coalescing) holds at the footprint that originally surfaced the bug. 2. **Materializer port** — internal/materializer + cmd/materializer + scripts/materializer_smoke.sh. Ports scripts/distillation/transforms.ts (12 transforms) + build_evidence_index.ts (idempotency, day-partition, receipt). On-wire JSON shape matches TS so Bun and Go runs are interchangeable. 14 tests green. 3. **Replay port** — internal/replay + cmd/replay + scripts/replay_smoke.sh. Ports scripts/distillation/replay.ts (retrieve → bundle → /v1/chat → validate → log). Closes audit-FULL phase 7 live invocation on the Go side. Both runtimes append to the same data/_kb/replay_runs.jsonl (schema=replay_run.v1). 14 tests green. Side effect on internal/distillation/types.go: EvidenceRecord gained prompt_tokens, completion_tokens, and metadata fields to mirror the TS shape the materializer transforms produce. STATE_OF_PLAY refreshed to 2026-05-02; ARCHITECTURE_COMPARISON decisions tracker moves the materializer + replay items from _open_ to DONE and adds the substrate-fix scale verification row. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
194 lines
5.3 KiB
Go
194 lines
5.3 KiB
Go
package replay
|
|
|
|
import (
|
|
"context"
|
|
"crypto/sha256"
|
|
"encoding/hex"
|
|
"encoding/json"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"time"
|
|
)
|
|
|
|
// DefaultRoot is what the CLI uses when --root isn't passed.
|
|
func DefaultRoot() string {
|
|
if r := os.Getenv("LH_DISTILL_ROOT"); r != "" {
|
|
return r
|
|
}
|
|
if cwd, err := os.Getwd(); err == nil {
|
|
return cwd
|
|
}
|
|
return "/home/profit/lakehouse"
|
|
}
|
|
|
|
// Replay runs the retrieve→prompt→model→validate→log pipeline.
// Returns a ReplayResult that's already been appended to
// data/_kb/replay_runs.jsonl. Note DryRun does NOT skip logging — it
// only swaps the HTTP model caller for a synthetic one; every run,
// dry or live, is appended to the log.
//
// Errors here are *infrastructure* failures (corpus unreadable, log
// write failed). A failed model call OR a failed validation gate is
// captured in ReplayResult.ValidationResult, not returned as error —
// callers can branch on Passed / EscalationPath.
func Replay(ctx context.Context, opts ReplayRequest, root string) (ReplayResult, error) {
	t0 := time.Now()
	recordedAt := time.Now().UTC().Format(time.RFC3339Nano)

	// Task hash doubles as the stable component of the recorded run ID.
	taskHash := sha256Hex(opts.Task)

	corpus, err := LoadRagCorpus(root)
	if err != nil {
		return ReplayResult{}, fmt.Errorf("load rag corpus: %w", err)
	}

	// bundle stays nil when retrieval is disabled; everything downstream
	// (BuildPrompt, ValidateResponse, ragIDs) is handed the nil and must
	// tolerate it.
	var bundle *ContextBundle
	if !opts.NoRetrieval {
		bundle = BuildContextBundle(corpus, opts.Task)
	}
	prompt := BuildPrompt(opts.Task, bundle)

	// Resolve models and gateway: explicit option wins, else defaults
	// (gateway additionally falls back to LH_GATEWAY_URL via gatewayFromEnv).
	localModel := orDefault(opts.LocalModel, DefaultLocalModel)
	escalationModel := orDefault(opts.EscalationModel, DefaultEscalationModel)
	gatewayURL := orDefault(opts.GatewayURL, gatewayFromEnv())

	caller := httpModelCaller(gatewayURL)
	if opts.DryRun {
		caller = dryRunCaller(opts.Task, bundle)
	}

	// escalation records every model attempted, in call order; it always
	// starts with the local model even if that call fails.
	escalation := []string{localModel}
	modelUsed := localModel
	var modelResponse string
	var validation ValidationResult

	// First attempt: local model.
	localCall := caller(ctx, localModel, prompt.System, prompt.User)
	if localCall.OK {
		modelResponse = localCall.Content
		validation = ValidateResponse(modelResponse, bundle)
	} else {
		// A transport/model failure is folded into a failed validation
		// rather than returned — escalation below may still recover it.
		validation = ValidationResult{
			Passed: false,
			Reasons: []string{"local call failed: " + localCall.Error},
		}
	}

	// Escalate only when the gate failed AND the caller opted in AND
	// local-only mode isn't forcing us to stop here.
	if !validation.Passed && opts.AllowEscalation && !opts.LocalOnly {
		escalation = append(escalation, escalationModel)
		escalCall := caller(ctx, escalationModel, prompt.System, prompt.User)
		if escalCall.OK {
			modelResponse = escalCall.Content
			modelUsed = escalationModel
			validation = ValidateResponse(modelResponse, bundle)
			if validation.Passed {
				// Prepend so the recovery note reads first in the log.
				validation.Reasons = append([]string{"recovered via escalation to " + escalationModel}, validation.Reasons...)
			}
		} else {
			validation.Reasons = append(validation.Reasons, "escalation also failed: "+escalCall.Error)
		}
	}

	// Run ID mixes the task hash with a hash of the RFC3339Nano timestamp
	// so repeated replays of the same task remain distinguishable.
	recordedRunID := fmt.Sprintf("replay:%s:%s",
		taskHash[:16],
		sha256Hex(recordedAt)[:12],
	)
	result := ReplayResult{
		InputTask: opts.Task,
		TaskHash: taskHash,
		RetrievedArtifacts: RetrievedIDs{RagIDs: ragIDs(bundle)},
		ContextBundle: bundle,
		ModelResponse: modelResponse,
		ModelUsed: modelUsed,
		EscalationPath: escalation,
		ValidationResult: validation,
		RecordedRunID: recordedRunID,
		RecordedAt: recordedAt,
		DurationMs: time.Since(t0).Milliseconds(),
	}

	if err := logReplayEvidence(root, result); err != nil {
		// Logging failure is real — surface it. The caller still gets the
		// in-memory result so they can inspect what happened.
		return result, fmt.Errorf("log replay evidence: %w", err)
	}
	return result, nil
}
|
|
|
|
// dryRunCaller wraps dryRunSynthesize as a ModelCaller. The escalation
|
|
// branch in Replay calls the caller a second time; for parity with TS,
|
|
// we return the same content suffixed with [ESCALATED] so a smoke can
|
|
// detect escalation in dry-run mode.
|
|
func dryRunCaller(task string, bundle *ContextBundle) ModelCaller {
|
|
calls := 0
|
|
return func(_ context.Context, _ string, _ string, _ string) callModelResult {
|
|
calls++
|
|
content := dryRunSynthesize(task, bundle)
|
|
if calls >= 2 {
|
|
content += "\n\n[ESCALATED]"
|
|
}
|
|
return callModelResult{Content: content, OK: true}
|
|
}
|
|
}
|
|
|
|
// logReplayEvidence appends one row to data/_kb/replay_runs.jsonl.
|
|
// model_response is truncated to 4000 chars in the persisted log to
|
|
// keep the file lean (matches TS behavior).
|
|
func logReplayEvidence(root string, result ReplayResult) error {
|
|
path := filepath.Join(root, "data", "_kb", "replay_runs.jsonl")
|
|
if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
|
|
return err
|
|
}
|
|
|
|
persist := struct {
|
|
Schema string `json:"schema"`
|
|
ReplayResult
|
|
}{
|
|
Schema: "replay_run.v1",
|
|
ReplayResult: result,
|
|
}
|
|
persist.ReplayResult.ModelResponse = trim(persist.ReplayResult.ModelResponse, 4000)
|
|
|
|
buf, err := json.Marshal(persist)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
buf = append(buf, '\n')
|
|
|
|
f, err := os.OpenFile(path, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o644)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer f.Close()
|
|
_, err = f.Write(buf)
|
|
return err
|
|
}
|
|
|
|
func ragIDs(bundle *ContextBundle) []string {
|
|
if bundle == nil {
|
|
return []string{}
|
|
}
|
|
out := make([]string, 0, len(bundle.RetrievedPlaybooks))
|
|
for _, p := range bundle.RetrievedPlaybooks {
|
|
out = append(out, p.RagID)
|
|
}
|
|
return out
|
|
}
|
|
|
|
// sha256Hex returns the lowercase hex digest of s.
func sha256Hex(s string) string {
	return fmt.Sprintf("%x", sha256.Sum256([]byte(s)))
}
|
|
|
|
func gatewayFromEnv() string {
|
|
if u := os.Getenv("LH_GATEWAY_URL"); u != "" {
|
|
return u
|
|
}
|
|
return DefaultGatewayURL
|
|
}
|
|
|
|
// orDefault returns v unless it is empty, in which case fallback.
func orDefault(v, fallback string) string {
	if v != "" {
		return v
	}
	return fallback
}
|