ai-movie-pipeline/src/validators/scene_validator.py
commit 87d0af0748 Phase 1 implementation: script ingestion + AI extraction pipeline
Complete working pipeline from Fountain script to validated scene JSON:
- Schemas (Pydantic): all 7 layers defined upfront
- Fountain parser + normalizer (Layer 1)
- AI scene extractor with prompt contracts (Layer 2)
- Schema validator + scene-specific semantic validator
- Structured JSON logging per layer/scene execution
- Versioned output writer (never overwrites)
- Retry engine with 4-level failure escalation
- Stop condition evaluator (per-unit + global halts)
- Diff/drift detector for re-run comparison
- CLI entry point with --dry-run, --scene, --test, --force
- 3 test scripts (dialogue-heavy, action-heavy, nonstandard)
- Expected output files for regression testing

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-06 15:49:43 -07:00

93 lines
3.4 KiB
Python

"""Scene-specific validation — post-schema semantic checks for Layer 2 output."""
from dataclasses import dataclass
from typing import Literal
from difflib import SequenceMatcher
from src.schemas.scene_array import SceneArray
@dataclass
class ValidationWarning:
    """A single semantic-validation finding for Layer 2 scene output.

    Warnings are advisory: validate_scenes returns them as a list and
    callers decide whether to escalate.
    """
    # Closed category set so downstream reporting/retry logic can switch on it.
    type: Literal[
        "duplicate_scene_number",
        "similar_character_names",
        "scene_count_deviation",
        "empty_characters",
        "unknown_values",
    ]
    # Scene the finding refers to; None for script-wide findings
    # (e.g. similar_character_names, scene_count_deviation).
    scene_number: int | None
    # Human-readable description of the problem.
    message: str
def validate_scenes(scene_array: SceneArray, heading_count: int) -> list[ValidationWarning]:
    """Run semantic validation checks on extracted scenes.

    Each check is a private helper that scans the scene list for one
    class of likely extraction error. Results are soft warnings rather
    than hard failures, so callers decide whether to escalate. Check
    order (and thus warning order) is fixed: duplicates, similar names,
    count deviation, empty character lists, UNKNOWN sentinels.

    Args:
        scene_array: Validated SceneArray from Layer 2.
        heading_count: Number of scene_heading elements from Layer 1 output.

    Returns:
        List of warnings. Empty list means all checks passed.
    """
    warnings: list[ValidationWarning] = []
    warnings.extend(_check_duplicate_scene_numbers(scene_array))
    warnings.extend(_check_similar_character_names(scene_array))
    warnings.extend(_check_scene_count_deviation(scene_array, heading_count))
    warnings.extend(_check_empty_characters(scene_array))
    warnings.extend(_check_unknown_values(scene_array))
    return warnings


def _check_duplicate_scene_numbers(scene_array: SceneArray) -> list[ValidationWarning]:
    """Warn once per repeat occurrence of a scene number.

    Defense-in-depth: the SceneArray schema validator should already
    reject duplicates, so this normally finds nothing.
    """
    warnings: list[ValidationWarning] = []
    # The original kept occurrence counts it never read; membership is all we need.
    seen: set[int] = set()
    for scene in scene_array.scenes:
        if scene.scene_number in seen:
            warnings.append(ValidationWarning(
                type="duplicate_scene_number",
                scene_number=scene.scene_number,
                message=f"Scene number {scene.scene_number} appears more than once",
            ))
        seen.add(scene.scene_number)
    return warnings


def _check_similar_character_names(scene_array: SceneArray) -> list[ValidationWarning]:
    """Warn on pairs of character names that look like the same person
    (case-insensitive SequenceMatcher similarity > 0.8)."""
    all_characters: set[str] = set()
    for scene in scene_array.scenes:
        all_characters.update(scene.characters_present)
    # Sort for a deterministic pair order across runs.
    char_list = sorted(all_characters)
    warnings: list[ValidationWarning] = []
    for i, name_a in enumerate(char_list):
        upper_a = name_a.upper()  # hoisted out of the inner loop
        for name_b in char_list[i + 1:]:
            # Set elements are distinct, so no name_a == name_b guard is needed.
            ratio = SequenceMatcher(None, upper_a, name_b.upper()).ratio()
            if ratio > 0.8:
                warnings.append(ValidationWarning(
                    type="similar_character_names",
                    scene_number=None,
                    message=f"Possible duplicate characters: '{name_a}' and '{name_b}' (similarity: {ratio:.0%})",
                ))
    return warnings


def _check_scene_count_deviation(scene_array: SceneArray, heading_count: int) -> list[ValidationWarning]:
    """Warn when the extracted scene count strays more than 20% from the
    Layer 1 heading count.

    Skipped when heading_count is not positive: there is no baseline to
    compare against (and the ratio would divide by zero).
    """
    if heading_count <= 0:
        return []
    extracted_count = len(scene_array.scenes)
    deviation = abs(extracted_count - heading_count) / heading_count
    if deviation <= 0.20:
        return []
    return [ValidationWarning(
        type="scene_count_deviation",
        scene_number=None,
        message=f"Extracted {extracted_count} scenes but Layer 1 found {heading_count} scene headings (deviation: {deviation:.0%})",
    )]


def _check_empty_characters(scene_array: SceneArray) -> list[ValidationWarning]:
    """Warn for each scene whose characters_present list is empty."""
    return [
        ValidationWarning(
            type="empty_characters",
            scene_number=scene.scene_number,
            message=f"Scene {scene.scene_number} has no characters listed",
        )
        for scene in scene_array.scenes
        if not scene.characters_present
    ]


def _check_unknown_values(scene_array: SceneArray) -> list[ValidationWarning]:
    """Warn for each scene field equal to the sentinel 'UNKNOWN' (or a list
    containing it) — places where the extractor could not determine a value."""
    warnings: list[ValidationWarning] = []
    for scene in scene_array.scenes:
        for key, value in scene.model_dump().items():
            if value == "UNKNOWN" or (isinstance(value, list) and "UNKNOWN" in value):
                warnings.append(ValidationWarning(
                    type="unknown_values",
                    scene_number=scene.scene_number,
                    message=f"Scene {scene.scene_number} field '{key}' contains UNKNOWN",
                ))
    return warnings