Complete working pipeline from Fountain script to validated scene JSON: - Schemas (Pydantic): all 7 layers defined upfront - Fountain parser + normalizer (Layer 1) - AI scene extractor with prompt contracts (Layer 2) - Schema validator + scene-specific semantic validator - Structured JSON logging per layer/scene execution - Versioned output writer (never overwrites) - Retry engine with 4-level failure escalation - Stop condition evaluator (per-unit + global halts) - Diff/drift detector for re-run comparison - CLI entry point with --dry-run, --scene, --test, --force - 3 test scripts (dialogue-heavy, action-heavy, nonstandard) - Expected output files for regression testing Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
93 lines
3.4 KiB
Python
93 lines
3.4 KiB
Python
"""Scene-specific validation — post-schema semantic checks for Layer 2 output."""
|
|
|
|
from dataclasses import dataclass
|
|
from typing import Literal
|
|
from difflib import SequenceMatcher
|
|
from src.schemas.scene_array import SceneArray
|
|
|
|
|
|
@dataclass
|
|
class ValidationWarning:
|
|
type: Literal[
|
|
"duplicate_scene_number",
|
|
"similar_character_names",
|
|
"scene_count_deviation",
|
|
"empty_characters",
|
|
"unknown_values",
|
|
]
|
|
scene_number: int | None
|
|
message: str
|
|
|
|
|
|
def validate_scenes(scene_array: SceneArray, heading_count: int) -> list[ValidationWarning]:
    """Collect semantic warnings for scenes that already passed schema validation.

    Five independent checks run in a fixed order, and their findings are
    appended in that same order: repeated scene numbers, near-identical
    character names, scene count drift against Layer 1, scenes without
    characters, and fields holding the "UNKNOWN" placeholder.

    Args:
        scene_array: Validated SceneArray from Layer 2; only ``scenes`` is read.
        heading_count: Number of scene_heading elements from Layer 1 output.

    Returns:
        The warnings found, in check order. An empty list means every check
        passed.
    """
    scenes = scene_array.scenes
    found: list[ValidationWarning] = []

    # Check 1: repeated scene numbers. Every occurrence after the first one
    # yields its own warning.
    numbers_so_far: set[int] = set()
    for sc in scenes:
        number = sc.scene_number
        if number in numbers_so_far:
            found.append(
                ValidationWarning(
                    type="duplicate_scene_number",
                    scene_number=number,
                    message=f"Scene number {number} appears more than once",
                )
            )
        else:
            numbers_so_far.add(number)

    # Check 2: character names that are suspiciously alike (possible
    # duplicates). Names are compared case-insensitively, each unordered pair
    # exactly once, in sorted order.
    names = sorted({name for sc in scenes for name in sc.characters_present})
    for idx, first in enumerate(names):
        for second in names[idx + 1:]:
            similarity = SequenceMatcher(None, first.upper(), second.upper()).ratio()
            if similarity > 0.8 and first != second:
                found.append(
                    ValidationWarning(
                        type="similar_character_names",
                        scene_number=None,
                        message=f"Possible duplicate characters: '{first}' and '{second}' (similarity: {similarity:.0%})",
                    )
                )

    # Check 3: extracted scene count versus Layer 1 heading count. Skipped
    # when Layer 1 reported no headings, since the relative deviation is
    # undefined in that case.
    total = len(scenes)
    if heading_count > 0:
        drift = abs(total - heading_count) / heading_count
        if drift > 0.20:
            found.append(
                ValidationWarning(
                    type="scene_count_deviation",
                    scene_number=None,
                    message=f"Extracted {total} scenes but Layer 1 found {heading_count} scene headings (deviation: {drift:.0%})",
                )
            )

    # Check 4: scenes whose characters_present is empty.
    found.extend(
        ValidationWarning(
            type="empty_characters",
            scene_number=sc.scene_number,
            message=f"Scene {sc.scene_number} has no characters listed",
        )
        for sc in scenes
        if not sc.characters_present
    )

    # Check 5: top-level fields equal to "UNKNOWN", or list fields that
    # contain it. Nested structures are not inspected.
    for sc in scenes:
        for field_name, field_value in sc.model_dump().items():
            flagged = field_value == "UNKNOWN"
            if not flagged and isinstance(field_value, list):
                flagged = "UNKNOWN" in field_value
            if not flagged:
                continue
            found.append(
                ValidationWarning(
                    type="unknown_values",
                    scene_number=sc.scene_number,
                    message=f"Scene {sc.scene_number} field '{field_name}' contains UNKNOWN",
                )
            )

    return found
|