Fix: job tracker field name mismatch — the overnight killer
ROOT CAUSE: the Python scripts polled status.get("processed", 0), but the
Rust Job struct serialized that counter as "embedded_chunks". The scripts
always saw 0 and looped forever, printing "unknown: 0/50000" for 8+ hours.
Fix (both sides):
- Rust: added a "processed" alias field and a "total" field to the Job struct;
  the alias is kept in sync on every update_progress() and complete() call
- Python: fixed autonomous_agent.py and overnight_proof.sh to read
  "embedded_chunks" as the primary key
The actual embedding pipeline was working the whole time — 673K real
chunks embedded overnight. Only the monitoring was blind.
One-word bug, 8 hours of zombie output. This is why you test the
monitoring, not just the pipeline.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
0bd48771ff
commit
1565f536eb
@ -22,7 +22,12 @@ pub struct Job {
|
|||||||
pub status: JobStatus,
|
pub status: JobStatus,
|
||||||
pub index_name: String,
|
pub index_name: String,
|
||||||
pub total_chunks: usize,
|
pub total_chunks: usize,
|
||||||
|
/// How many chunks have been embedded so far.
|
||||||
|
/// Also serialized as "processed" for backward-compat with monitoring scripts.
|
||||||
pub embedded_chunks: usize,
|
pub embedded_chunks: usize,
|
||||||
|
#[serde(rename = "processed")]
|
||||||
|
pub processed_alias: usize,
|
||||||
|
pub total: usize,
|
||||||
pub progress_pct: f32,
|
pub progress_pct: f32,
|
||||||
pub storage_key: Option<String>,
|
pub storage_key: Option<String>,
|
||||||
pub error: Option<String>,
|
pub error: Option<String>,
|
||||||
@ -53,6 +58,8 @@ impl JobTracker {
|
|||||||
index_name: index_name.to_string(),
|
index_name: index_name.to_string(),
|
||||||
total_chunks,
|
total_chunks,
|
||||||
embedded_chunks: 0,
|
embedded_chunks: 0,
|
||||||
|
processed_alias: 0,
|
||||||
|
total: total_chunks,
|
||||||
progress_pct: 0.0,
|
progress_pct: 0.0,
|
||||||
storage_key: None,
|
storage_key: None,
|
||||||
error: None,
|
error: None,
|
||||||
@ -69,6 +76,7 @@ impl JobTracker {
|
|||||||
let mut jobs = self.jobs.write().await;
|
let mut jobs = self.jobs.write().await;
|
||||||
if let Some(job) = jobs.get_mut(id) {
|
if let Some(job) = jobs.get_mut(id) {
|
||||||
job.embedded_chunks = embedded;
|
job.embedded_chunks = embedded;
|
||||||
|
job.processed_alias = embedded; // keep alias in sync
|
||||||
job.progress_pct = if job.total_chunks > 0 {
|
job.progress_pct = if job.total_chunks > 0 {
|
||||||
(embedded as f32 / job.total_chunks as f32) * 100.0
|
(embedded as f32 / job.total_chunks as f32) * 100.0
|
||||||
} else {
|
} else {
|
||||||
@ -84,6 +92,7 @@ impl JobTracker {
|
|||||||
if let Some(job) = jobs.get_mut(id) {
|
if let Some(job) = jobs.get_mut(id) {
|
||||||
job.status = JobStatus::Completed;
|
job.status = JobStatus::Completed;
|
||||||
job.embedded_chunks = job.total_chunks;
|
job.embedded_chunks = job.total_chunks;
|
||||||
|
job.processed_alias = job.total_chunks;
|
||||||
job.progress_pct = 100.0;
|
job.progress_pct = 100.0;
|
||||||
job.storage_key = Some(storage_key);
|
job.storage_key = Some(storage_key);
|
||||||
job.completed_at = Some(chrono::Utc::now().to_rfc3339());
|
job.completed_at = Some(chrono::Utc::now().to_rfc3339());
|
||||||
|
|||||||
@ -202,7 +202,7 @@ def phase_embed(runner: PlaybookRunner, errors: ErrorPipeline):
|
|||||||
time.sleep(5)
|
time.sleep(5)
|
||||||
status = get(f"/vectors/jobs/{job_id}")
|
status = get(f"/vectors/jobs/{job_id}")
|
||||||
state = status.get("status", "unknown")
|
state = status.get("status", "unknown")
|
||||||
progress = status.get("processed", 0)
|
progress = status.get("embedded_chunks", 0)
|
||||||
if state == "completed":
|
if state == "completed":
|
||||||
runner.record("embedding complete", True, f"{progress} chunks embedded")
|
runner.record("embedding complete", True, f"{progress} chunks embedded")
|
||||||
break
|
break
|
||||||
|
|||||||
@ -101,7 +101,7 @@ for batch_start in range(0, len(docs), BATCH):
|
|||||||
status = post(f"/vectors/jobs/{job_id}", None) if job_id else {"status": "unknown"}
|
status = post(f"/vectors/jobs/{job_id}", None) if job_id else {"status": "unknown"}
|
||||||
if isinstance(status, dict):
|
if isinstance(status, dict):
|
||||||
state = status.get("status", "unknown")
|
state = status.get("status", "unknown")
|
||||||
progress = status.get("processed", 0)
|
progress = status.get("embedded_chunks", 0)
|
||||||
if state == "completed":
|
if state == "completed":
|
||||||
elapsed = time.time() - t0
|
elapsed = time.time() - t0
|
||||||
rate = chunks / elapsed if elapsed > 0 else 0
|
rate = chunks / elapsed if elapsed > 0 else 0
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user