Fix: job tracker field name mismatch — the overnight killer

ROOT CAUSE: Python scripts polled status.get("processed", 0), but the
Rust Job struct serialized that counter as "embedded_chunks". The scripts
always saw 0 and kept looping, printing "unknown: 0/50000" for 8+ hours.

Fix (both sides):
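- Rust: added a "processed" alias field and a "total" field to the Job
  struct; "processed" is kept in sync with embedded_chunks on every
  update_progress() and complete() call, and "total" is set from
  total_chunks when the job is created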
- Python: fixed autonomous_agent.py and overnight_proof.sh to read
  "embedded_chunks" instead of "processed"

The actual embedding pipeline was working the whole time — 673K real
chunks embedded overnight. Only the monitoring was blind.

One-word bug, 8 hours of zombie output. This is why you test the
monitoring, not just the pipeline.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
root 2026-04-17 10:41:32 -05:00
parent 0bd48771ff
commit 1565f536eb
3 changed files with 11 additions and 2 deletions

View File

@ -22,7 +22,12 @@ pub struct Job {
pub status: JobStatus,
pub index_name: String,
pub total_chunks: usize,
/// How many chunks have been embedded so far.
/// Also serialized as "processed" for backward-compat with monitoring scripts.
pub embedded_chunks: usize,
#[serde(rename = "processed")]
pub processed_alias: usize,
pub total: usize,
pub progress_pct: f32,
pub storage_key: Option<String>,
pub error: Option<String>,
@ -53,6 +58,8 @@ impl JobTracker {
index_name: index_name.to_string(),
total_chunks,
embedded_chunks: 0,
processed_alias: 0,
total: total_chunks,
progress_pct: 0.0,
storage_key: None,
error: None,
@ -69,6 +76,7 @@ impl JobTracker {
let mut jobs = self.jobs.write().await;
if let Some(job) = jobs.get_mut(id) {
job.embedded_chunks = embedded;
job.processed_alias = embedded; // keep alias in sync
job.progress_pct = if job.total_chunks > 0 {
(embedded as f32 / job.total_chunks as f32) * 100.0
} else {
@ -84,6 +92,7 @@ impl JobTracker {
if let Some(job) = jobs.get_mut(id) {
job.status = JobStatus::Completed;
job.embedded_chunks = job.total_chunks;
job.processed_alias = job.total_chunks;
job.progress_pct = 100.0;
job.storage_key = Some(storage_key);
job.completed_at = Some(chrono::Utc::now().to_rfc3339());

View File

@ -202,7 +202,7 @@ def phase_embed(runner: PlaybookRunner, errors: ErrorPipeline):
time.sleep(5)
status = get(f"/vectors/jobs/{job_id}")
state = status.get("status", "unknown")
progress = status.get("processed", 0)
progress = status.get("embedded_chunks", 0)
if state == "completed":
runner.record("embedding complete", True, f"{progress} chunks embedded")
break

View File

@ -101,7 +101,7 @@ for batch_start in range(0, len(docs), BATCH):
status = post(f"/vectors/jobs/{job_id}", None) if job_id else {"status": "unknown"}
if isinstance(status, dict):
state = status.get("status", "unknown")
progress = status.get("processed", 0)
progress = status.get("embedded_chunks", 0)
if state == "completed":
elapsed = time.time() - t0
rate = chunks / elapsed if elapsed > 0 else 0