lakehouse/crates/vectord/src/doc_drift.rs
commit 8bacd43465
Some checks failed
lakehouse/auditor cloud: claim not backed — "Previously the hybrid fixture honestly reported layer 5 as 404/unimplemented. With this PR it flips "
Phase 45 slice 3: doc_drift check + resolve endpoints
Closes the last open loop of Phase 45. Previously, playbooks could
carry doc_refs (slice 1) and the context7 bridge could report drift
(slice 2) — but nothing tied them together. An operator had no way
to say "check this playbook against its doc sources and flag it if
the docs moved." This slice wires that.

Ships:
- crates/vectord/src/doc_drift.rs — thin context7 bridge client.
  No cache (bridge has its own 5-min TTL). No retry (transient
  failure = Unknown outcome, caller decides).
- PlaybookMemory::flag_doc_drift(id) — stamps doc_drift_flagged_at
  idempotently. Once flagged, compute_boost_for_filtered_with_role
  excludes the entry from both the non-geo and geo-indexed boost
  paths until resolved.
- PlaybookMemory::resolve_doc_drift(id) — human re-admission.
  Stamps doc_drift_reviewed_at which clears the boost exclusion.
- PlaybookMemory::get_entry(id) — new read-only accessor the
  handler uses to read doc_refs without exposing the state lock.
- POST /vectors/playbook_memory/doc_drift/check/{id}
- POST /vectors/playbook_memory/doc_drift/resolve/{id}
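
The flag/resolve gate above can be sketched as follows. This is a
hypothetical stand-in, not the real PlaybookMemory: field and method
names follow the commit message, and the eligibility rule (a review
stamped at or after the flag re-admits the entry) is an assumption
inferred from the test names.

```rust
/// Minimal model of the doc-drift admission gate (sketch only).
#[derive(Default)]
pub struct EntryGate {
    doc_drift_flagged_at: Option<u64>,  // unix seconds
    doc_drift_reviewed_at: Option<u64>,
}

impl EntryGate {
    /// Idempotent: a second flag call keeps the original timestamp.
    pub fn flag_doc_drift(&mut self, now: u64) {
        self.doc_drift_flagged_at.get_or_insert(now);
    }

    /// Human re-admission: stamp the review time.
    pub fn resolve_doc_drift(&mut self, now: u64) {
        self.doc_drift_reviewed_at = Some(now);
    }

    /// Boost paths skip entries that are flagged and not yet reviewed.
    pub fn boost_eligible(&self) -> bool {
        match (self.doc_drift_flagged_at, self.doc_drift_reviewed_at) {
            (None, _) => true,
            (Some(f), Some(r)) => r >= f,
            (Some(_), None) => false,
        }
    }
}
```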

Design call: Unknown outcomes from the bridge (bridge down, tool
not in context7, no snippet_hash recorded) are NEVER enough to
flag. Only a positive drifted=true from the bridge flips the flag.
A down bridge doesn't silently drift-flag every playbook.
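
The decision rule reduces to an any-over-outcomes check. A sketch,
with a pared-down stand-in for DriftOutcome (the real enum carries
payloads; the hypothetical `should_flag` helper is not in the PR):

```rust
/// Pared-down drift outcome for illustration.
pub enum Outcome {
    Drifted,
    Unchanged,
    Unknown,
}

/// Flag only when at least one doc_ref came back positively drifted;
/// Unchanged and Unknown never flag.
pub fn should_flag(outcomes: &[Outcome]) -> bool {
    outcomes.iter().any(|o| matches!(o, Outcome::Drifted))
}
```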

Tests (5 new, in upsert_tests mod):
- flag_doc_drift_stamps_timestamp_and_persists
- flag_doc_drift_is_idempotent_on_already_flagged
- resolve_doc_drift_clears_flag_admission_gate
- boost_excludes_flagged_unreviewed_entries
- boost_re_admits_resolved_entries
14/14 upsert tests pass (9 pre-existing + 5 new).

Live end-to-end — hybrid fixture on auditor/scaffold (merged to
main at b6d69b2) now shows:

  overall: PASS
  shipped: [38, 40, 45.1, 45.2, 45.3]
  placeholder: [—]
  ✓ Phase 38    /v1/chat              4039ms
  ✓ Phase 40    Langfuse trace          11ms
  ✓ Phase 45.1  seed + doc_refs        748ms
  ✓ Phase 45.2  bridge diff            563ms
  ✓ Phase 45.3  drift-check endpoint   116ms ← was a 404 before this

First time the fixture reports overall=PASS with zero placeholder
layers. The honest "not built" signal on layer 5 is now honestly
"built and working."

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 14:12:57 -05:00


// Phase 45 slice 3 — context7 bridge client for doc-drift detection.
//
// Calls the Bun context7 bridge on :3900 (mcp-server/context7_bridge.ts)
// which itself wraps context7's public API. For each DocRef on a
// playbook, queries /docs/:tool/diff?since=<snippet_hash> and parses
// the response.
//
// Kept deliberately thin: no caching here (bridge caches for 5 min),
// no retry logic (transient fail = report as "unknown" drift status,
// don't flag). The handler decides whether to flag the playbook based
// on whether ANY tool came back drifted=true.
use serde::Deserialize;
use std::time::Duration;

use crate::playbook_memory::DocRef;

const DEFAULT_BRIDGE_URL: &str = "http://localhost:3900";
const CALL_TIMEOUT_SECS: u64 = 15;

#[derive(Debug, Clone)]
pub struct DriftCheck {
    pub tool: String,
    pub version_seen: String,
    pub outcome: DriftOutcome,
}

#[derive(Debug, Clone)]
pub enum DriftOutcome {
    /// Bridge reports drift (current snippet_hash != previous).
    Drifted { current_snippet_hash: String, source_url: Option<String> },
    /// Bridge reports no drift — the stored snippet_hash still matches
    /// the current context7 docs.
    Unchanged,
    /// Bridge unreachable, 404 on the tool, or returned unparseable
    /// data. We deliberately don't flag on these — a down bridge
    /// shouldn't silently mark every playbook drift-flagged.
    Unknown { reason: String },
}

#[derive(Debug, Clone, Deserialize)]
struct BridgeDiffResponse {
    drifted: bool,
    current_snippet_hash: Option<String>,
    source_url: Option<String>,
}

pub struct DriftCheckerConfig {
    pub bridge_url: String,
}

impl Default for DriftCheckerConfig {
    fn default() -> Self {
        Self {
            bridge_url: std::env::var("LH_BRIDGE_URL")
                .unwrap_or_else(|_| DEFAULT_BRIDGE_URL.to_string()),
        }
    }
}

/// For every doc_ref, ask the bridge whether it drifted against the
/// recorded snippet_hash. Returns per-tool outcomes.
pub async fn check_all_refs(
    cfg: &DriftCheckerConfig,
    doc_refs: &[DocRef],
) -> Vec<DriftCheck> {
    let client = reqwest::Client::builder()
        .timeout(Duration::from_secs(CALL_TIMEOUT_SECS))
        .build()
        .expect("reqwest client build");
    let mut out = Vec::with_capacity(doc_refs.len());
    for r in doc_refs {
        let hash = r.snippet_hash.as_deref().unwrap_or("");
        if hash.is_empty() {
            // No hash to compare against — can't detect drift. Report
            // unknown so the caller isn't forced to flag.
            out.push(DriftCheck {
                tool: r.tool.clone(),
                version_seen: r.version_seen.clone(),
                outcome: DriftOutcome::Unknown {
                    reason: "no snippet_hash recorded on doc_ref".into(),
                },
            });
            continue;
        }
        let url = format!(
            "{}/docs/{}/diff?since={}",
            cfg.bridge_url.trim_end_matches('/'),
            urlencoding_minimal(&r.tool),
            urlencoding_minimal(hash),
        );
        let outcome = match client.get(&url).send().await {
            Err(e) => DriftOutcome::Unknown {
                reason: format!("bridge unreachable: {}", e),
            },
            Ok(resp) => {
                if resp.status() == reqwest::StatusCode::NOT_FOUND {
                    DriftOutcome::Unknown {
                        reason: format!("bridge 404 — no context7 library for tool '{}'", r.tool),
                    }
                } else if !resp.status().is_success() {
                    let status = resp.status();
                    DriftOutcome::Unknown {
                        reason: format!("bridge {}: {}", status, resp.text().await.unwrap_or_default()),
                    }
                } else {
                    match resp.json::<BridgeDiffResponse>().await {
                        Err(e) => DriftOutcome::Unknown {
                            reason: format!("bridge response parse: {}", e),
                        },
                        Ok(body) => {
                            if body.drifted {
                                DriftOutcome::Drifted {
                                    current_snippet_hash: body.current_snippet_hash.unwrap_or_default(),
                                    source_url: body.source_url,
                                }
                            } else {
                                DriftOutcome::Unchanged
                            }
                        }
                    }
                }
            }
        };
        out.push(DriftCheck {
            tool: r.tool.clone(),
            version_seen: r.version_seen.clone(),
            outcome,
        });
    }
    out
}

/// Minimal URL path+query encoder for ASCII tool names. We're not
/// pulling in the `urlencoding` crate for this single use — tool
/// names are short alphanumeric (docker/terraform/react), snippet
/// hashes are hex. Handles space + `/` + `?` + `&` defensively so
/// unusual tool names don't corrupt the URL.
fn urlencoding_minimal(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    for b in s.bytes() {
        match b {
            b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b'.' | b'~' => out.push(b as char),
            _ => out.push_str(&format!("%{:02X}", b)),
        }
    }
    out
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn urlencoding_handles_ascii_safe_chars_passthrough() {
        assert_eq!(urlencoding_minimal("docker"), "docker");
        assert_eq!(urlencoding_minimal("next.js"), "next.js");
    }

    #[test]
    fn urlencoding_encodes_slash_space_and_query_chars() {
        assert_eq!(urlencoding_minimal("foo/bar"), "foo%2Fbar");
        assert_eq!(urlencoding_minimal("foo bar"), "foo%20bar");
        // Defensive cases called out in the doc comment: `?` and `&`.
        assert_eq!(urlencoding_minimal("a?b&c"), "a%3Fb%26c");
    }
}