Closes J's 2026-05-02 multi-call observability gap: a single
/v1/iterate session with N retries used to surface in Langfuse as
N+1 disconnected traces (one per /v1/chat hop plus one for the
iterate request itself), with no parent/child linkage. Operators
couldn't walk the retry chain in a single trace tree to spot where
grounding failed.
## Wire-level change
- New header constant `shared.TraceIDHeader = "X-Lakehouse-Trace-Id"`
- `langfuseMiddleware` honors the header on inbound requests: if
set, reuses that trace id instead of minting a new one. Stashes
the trace id on the request context so handlers can attach
application-level child spans.
- `validatord.chatCaller` forwards the header to chatd. Every chat
hop in an iterate session lands as a child of the parent trace.
## Application-level spans
- `validator.IterateConfig` gains `Tracer` (optional callback).
  When wired, each iteration attempt emits one Langfuse span
  via `validator.AttemptSpan`:
  - Name: `iterate.attempt[N]`
  - Input: `{ iteration, model, provider, prompt }`
  - Output: `{ verdict, raw, error }`
  - Level: `WARNING` when verdict != accepted
- `validatord.iterTracer` is the production hook — bridges
`validator.Tracer` → `langfuse.Client.Span`.
- `IterateRequest`/`IterateResponse`/`IterateFailure` gain
`TraceID`; each `IterateAttempt` gains `SpanID`. The /v1/iterate
caller can pivot from the JSON response straight into the
Langfuse trace tree.
## What an operator sees post-cutover
```
GET /v1/iterate {kind=fill, prompt=...} → Trace TR-1
├─ http.request span (from middleware)
├─ iterate.attempt[0] span (validator.Iterate emit)
│    input: prompt+model
│    output: { verdict: validation_failed, error: ..., raw }
├─ chatd /v1/chat call (X-Lakehouse-Trace-Id: TR-1)
│    ├─ http.request span (chatd middleware)
│    └─ chatd-internal spans (existing)
├─ iterate.attempt[1] span
└─ ...
```
All in one Langfuse trace tree, not N+1 separate traces.
## Hallucinated-worker safety net is unchanged
The /v1/iterate flow's hard correctness gate is still
FillValidator + WorkerLookup. Phantom candidate IDs raise
ValidationError::Consistency, which returns a 422 and forces the
iteration loop to retry. Trace-id propagation is the OBSERVABILITY
layer on top: it makes the existing safety net's outcomes visible
per call, not a replacement for it.
## Verification
- internal/validator: 4 new tests
  - TestIterate_TracerEmitsSpanPerAttempt — span/attempt count + SpanID
  - TestIterate_NoTraceIDSkipsTracer — no orphan spans without trace_id
  - TestIterate_ChatCallerReceivesTraceID — propagation contract
  - (existing iterate tests updated for new ChatCaller signature)
- internal/shared: 1 new test
  - TestLangfuseMiddleware_HonorsTraceIDHeader — cross-service linkage
- cmd/validatord: existing HTTP tests still PASS via the dual-shape
UnmarshalJSON contract.
- validatord_smoke.sh: 5/5 PASS through gateway :3110 (unchanged).
- Full `go test ./...` green across 33 packages.
## Architecture invariant added
STATE_OF_PLAY "DO NOT RELITIGATE" gains a paragraph documenting
the X-Lakehouse-Trace-Id header contract + the iterate.attempt[N]
span emission. Future-Claude won't re-propose "wire trace-id
propagation" — the header IS the wiring.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
```go
package shared

import (
	"context"
	"net/http"
	"os"
	"time"

	"git.agentview.dev/profit/golangLAKEHOUSE/internal/langfuse"
)

// TraceIDHeader propagates a Langfuse trace id across services. When
// validatord makes a /v1/iterate call that internally calls chatd's
// /v1/chat, validatord sends this header so both daemons' middleware
// emit spans under the SAME trace tree (rather than two unrelated
// traces). Closes the multi-call observability gap J flagged
// 2026-05-02 ("we need to make sure they have the corpus of
// information to complete the process and we want to spot errors").
const TraceIDHeader = "X-Lakehouse-Trace-Id"

// traceIDCtxKey is the context value key for the per-request trace id.
// Handlers downstream of langfuseMiddleware can pull it via TraceIDFromCtx
// to attach child spans (e.g. iteration-attempt spans inside validatord).
type traceIDCtxKey struct{}

// TraceIDFromCtx returns the per-request Langfuse trace id, or "" if
// the middleware didn't set one (Langfuse not configured / /health
// bypass / no Client wired).
func TraceIDFromCtx(ctx context.Context) string {
	if v, ok := ctx.Value(traceIDCtxKey{}).(string); ok {
		return v
	}
	return ""
}

// langfuseMiddleware emits one Langfuse trace per HTTP request, with
// a single span carrying start/end timestamps + status code. Per
// OPEN item #2 (closed by the wave that adds this file): production
// traffic gets free trace visibility without per-handler wiring.
//
// nil client → returns a passthrough no-op middleware so callers
// don't need a nil check in shared.Run. Same fail-open posture as
// Langfuse's queue layer (per ADR-005 Decision 5.1: observability
// is a witness, never a gate).
//
// /health bypasses tracing — operators don't want every LB probe
// or monitor heartbeat polluting traces. Real traffic surfaces
// only via the registered routes.
func langfuseMiddleware(serviceName string, lf *langfuse.Client) func(http.Handler) http.Handler {
	if lf == nil {
		return func(next http.Handler) http.Handler { return next }
	}
	return func(next http.Handler) http.Handler {
		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			// /health bypasses tracing — same exemption logic as
			// the auth middleware (see RequireAuth).
			if r.URL.Path == "/health" {
				next.ServeHTTP(w, r)
				return
			}
			start := time.Now()
			sw := &statusWriter{ResponseWriter: w, status: http.StatusOK}

			// If the caller forwarded a trace id (cross-service parent
			// linkage) reuse it instead of starting a new trace. Spans
			// from this service then attach to the parent trace tree
			// so a /v1/iterate session shows as one trace with
			// children for each /v1/chat hop.
			traceID := r.Header.Get(TraceIDHeader)
			if traceID == "" {
				traceID = lf.Trace(r.Context(), langfuse.TraceInput{
					Name: serviceName + " " + r.Method + " " + r.URL.Path,
					Tags: []string{serviceName, r.Method},
					Metadata: map[string]any{
						"path":        r.URL.Path,
						"method":      r.Method,
						"remote_addr": r.RemoteAddr,
					},
				})
			}

			// Stash the trace id on the request context so downstream
			// handlers can attach finer-grained spans (e.g. one per
			// iteration attempt inside validator.Iterate).
			r = r.WithContext(context.WithValue(r.Context(), traceIDCtxKey{}, traceID))

			next.ServeHTTP(sw, r)

			level := ""
			if sw.status >= 500 {
				level = "ERROR"
			} else if sw.status >= 400 {
				level = "WARNING"
			}
			lf.Span(r.Context(), langfuse.SpanInput{
				TraceID: traceID,
				Name:    "http.request",
				Input: map[string]any{
					"method":      r.Method,
					"path":        r.URL.Path,
					"remote_addr": r.RemoteAddr,
				},
				Output: map[string]any{
					"status":      sw.status,
					"duration_ms": time.Since(start).Milliseconds(),
				},
				StartTime:  start,
				EndTime:    time.Now(),
				StatusCode: sw.status,
				Level:      level,
			})
		})
	}
}

// statusWriter is the standard "wrap http.ResponseWriter to capture
// the status code" trick. WriteHeader is the only method that
// changes status; any handler that doesn't call WriteHeader gets
// the implicit 200 from our struct's default.
type statusWriter struct {
	http.ResponseWriter
	status int
}

func (sw *statusWriter) WriteHeader(code int) {
	sw.status = code
	sw.ResponseWriter.WriteHeader(code)
}

// LoadLangfuseFromEnv builds a langfuse.Client from environment
// variables. Returns nil if any of LANGFUSE_URL / LANGFUSE_PUBLIC_KEY
// / LANGFUSE_SECRET_KEY is unset (best-effort: missing config means
// no tracing, never a startup error). Same env names as the bare
// /etc/lakehouse/langfuse.env file used by the multi_coord_stress
// driver — operators ship one env file across every daemon.
//
// Exported 2026-05-02 so daemons that need to emit application-level
// child spans (validatord's iterate-attempt spans) can hold their own
// reference to the same client `shared.Run` is already wiring into
// the middleware.
func LoadLangfuseFromEnv() *langfuse.Client {
	url := os.Getenv("LANGFUSE_URL")
	pk := os.Getenv("LANGFUSE_PUBLIC_KEY")
	sk := os.Getenv("LANGFUSE_SECRET_KEY")
	if url == "" || pk == "" || sk == "" {
		return nil
	}
	return langfuse.New(url, pk, sk, nil)
}
```