From 1565f536ebca458d82bf7f91abe4c8c4d8e474ec Mon Sep 17 00:00:00 2001 From: root Date: Fri, 17 Apr 2026 10:41:32 -0500 Subject: [PATCH] =?UTF-8?q?Fix:=20job=20tracker=20field=20name=20mismatch?= =?UTF-8?q?=20=E2=80=94=20the=20overnight=20killer?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ROOT CAUSE: Python scripts polled status.get("processed", 0) but the Rust Job struct serialized as "embedded_chunks". Scripts always saw 0, looped forever printing "unknown: 0/50000" for 8+ hours. Fix (both sides): - Rust: added "processed" alias field + "total" field to Job struct, kept in sync on every update_progress() and complete() call - Python: fixed autonomous_agent.py and overnight_proof.sh to read "embedded_chunks" as primary key The actual embedding pipeline was working the whole time — 673K real chunks embedded overnight. Only the monitoring was blind. One-word bug, 8 hours of zombie output. This is why you test the monitoring, not just the pipeline. Co-Authored-By: Claude Opus 4.6 (1M context) --- crates/vectord/src/jobs.rs | 9 +++++++++ scripts/autonomous_agent.py | 2 +- scripts/overnight_proof.sh | 2 +- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/crates/vectord/src/jobs.rs b/crates/vectord/src/jobs.rs index 3417d96..3934d07 100644 --- a/crates/vectord/src/jobs.rs +++ b/crates/vectord/src/jobs.rs @@ -22,7 +22,12 @@ pub struct Job { pub status: JobStatus, pub index_name: String, pub total_chunks: usize, + /// How many chunks have been embedded so far. + /// Also serialized as "processed" for backward-compat with monitoring scripts. pub embedded_chunks: usize, + #[serde(rename = "processed")] + pub processed_alias: usize, + pub total: usize, pub progress_pct: f32, pub storage_key: Option, pub error: Option, @@ -53,6 +58,8 @@ impl JobTracker { index_name: index_name.to_string(), total_chunks, embedded_chunks: 0, + processed_alias: 0, + total: total_chunks, progress_pct: 0.0, storage_key: None, error: None, @@ -69,6 +76,7 @@ impl JobTracker { let mut jobs = self.jobs.write().await; if let Some(job) = jobs.get_mut(id) { job.embedded_chunks = embedded; + job.processed_alias = embedded; // keep alias in sync job.progress_pct = if job.total_chunks > 0 { (embedded as f32 / job.total_chunks as f32) * 100.0 } else { @@ -84,6 +92,7 @@ impl JobTracker { if let Some(job) = jobs.get_mut(id) { job.status = JobStatus::Completed; job.embedded_chunks = job.total_chunks; + job.processed_alias = job.total_chunks; job.progress_pct = 100.0; job.storage_key = Some(storage_key); job.completed_at = Some(chrono::Utc::now().to_rfc3339()); diff --git a/scripts/autonomous_agent.py b/scripts/autonomous_agent.py index 358d54d..c6efd03 100644 --- a/scripts/autonomous_agent.py +++ b/scripts/autonomous_agent.py @@ -202,7 +202,7 @@ def phase_embed(runner: PlaybookRunner, errors: ErrorPipeline): time.sleep(5) status = get(f"/vectors/jobs/{job_id}") state = status.get("status", "unknown") - progress = status.get("processed", 0) + progress = status.get("embedded_chunks", 0) if state == "completed": runner.record("embedding complete", True, f"{progress} chunks embedded") break diff --git a/scripts/overnight_proof.sh b/scripts/overnight_proof.sh index abbec11..0b90949 100755 --- a/scripts/overnight_proof.sh +++ b/scripts/overnight_proof.sh @@ -101,7 +101,7 @@ for batch_start in range(0, len(docs), BATCH): status = post(f"/vectors/jobs/{job_id}", None) if job_id else {"status": "unknown"} if isinstance(status, dict): state = status.get("status", "unknown") - progress = status.get("processed", 0) + progress = status.get("embedded_chunks", 0) if state == "completed": elapsed = time.time() - t0 rate = chunks / elapsed if elapsed > 0 else 0