Skip to content

Model Development & Testing

dorsal.testing

RunModelResult pydantic-model

Bases: BaseModel

The standardized result object returned by ModelRunner execution steps.

This object encapsulates the output of a single Annotation Model, including its generated data, source identity, execution timing, and any errors encountered.

Fields:

error pydantic-field

error = None

A descriptive error message if the model failed, crashed, or if a dependency was not met.

name pydantic-field

name

The display name of the model (usually the class name).

record pydantic-field

record

The generated annotation record (dict). None if the model failed, was skipped, or produced no output.

schema_id pydantic-field

schema_id

The validation schema/model ID against which this record was validated.

schema_version pydantic-field

schema_version = None

The version of the schema/model against which this record was validated.

source pydantic-field

source

Structured metadata identifying the model source (ID, version, variant).

time_taken pydantic-field

time_taken = None

Execution time in seconds. Populated only if debug mode is active.

get_json_schema_validator

get_json_schema_validator(schema, strict=False)

Prepares a configured jsonschema validator instance for a given schema.

This function first performs structural validation (metaschema check) to ensure the input schema adheres to the rules of the JSON Schema specification (Draft 2020-12).

Parameters:

Name Type Description Default
schema dict

The JSON Schema (as a dictionary) to validate against.

required
strict bool

If True, performs an added "liveness" check to ensure the schema contains actual validation keywords (e.g., 'type', 'properties'). Defaults to False.

False

Returns:

Name Type Description
JsonSchemaValidatorType JsonSchemaValidatorType

A callable jsonschema validator instance.

Raises:

Type Description
TypeError

If the input schema is not a dictionary.

ValueError

If the input schema is empty, or if strict=True and the schema is found to be inert (lacks validation keywords).

SchemaFormatError

If the input schema is structurally invalid (fails the metaschema check, e.g., 'type' is not a string/array).

DorsalError

For unexpected errors during initialization.

Source code in venv/lib/python3.13/site-packages/dorsal/common/validators/json_schema.py
def get_json_schema_validator(schema: dict, strict: bool = False) -> JsonSchemaValidatorType:
    """
    Build a configured jsonschema validator instance for a given schema.

    Before the validator is constructed, the schema itself is structurally
    validated (a metaschema check) against the JSON Schema Draft 2020-12
    specification.

    Args:
        schema: The JSON Schema, as a dictionary, to validate against.
        strict: When True, additionally require the schema to contain at
                least one known validation keyword (e.g., 'type',
                'properties'). Defaults to False.

    Returns:
        JsonSchemaValidatorType: A callable jsonschema validator instance.

    Raises:
        TypeError: If the input schema is not a dictionary.
        ValueError: If the input schema is empty, or if `strict=True` and
                    the schema is inert (contains no validation keywords).
        SchemaFormatError: If the schema fails the metaschema check
                           (e.g., 'type' is not a string/array).
        DorsalError: For unexpected errors during initialization.
    """

    logger.debug("Preparing custom validator for the provided JSON schema.")

    # Guard clauses: reject non-dict and empty inputs up front.
    if not isinstance(cast(Any, schema), dict):
        logger.error("Schema must be a dictionary. Got type: %s", type(schema).__name__)
        raise TypeError(
            f"The 'schema' argument must be a dictionary, got {type(schema).__name__}."
            " Tip: If your schema is a JSON string, use 'json.loads(your_string)' first."
        )

    if not schema:
        logger.error("Schema dictionary cannot be empty.")
        raise ValueError("The 'schema' dictionary cannot be empty.")

    # "Liveness" check: a schema without any known validation keyword would
    # accept every document, which is almost certainly a user mistake.
    if strict and not any(key in schema for key in JSON_SCHEMA_LIVENESS_KEYWORDS):
        logger.warning("Schema appears to be inert (no validation keywords found).")
        raise ValueError(
            "The provided schema appears to be inert: it contains no known JSON Schema "
            "validation keywords (like 'type', 'properties', etc.) "
            "and would silently pass all validation."
        )

    try:
        # Metaschema check first: fail fast on a structurally invalid schema.
        Draft202012Validator.check_schema(schema)

        validator = JsonSchemaValidator(schema=schema, format_checker=jsonschema.FormatChecker())
        logger.debug("Custom validator instance prepared successfully for the schema.")
        return validator

    except JSONSchemaSchemaError as err:
        logger.exception(
            "The provided schema is structurally invalid and cannot be used to create a validator: %s",
            err.message,
        )
        raise SchemaFormatError(
            message="The provided schema is invalid and cannot be used to prepare a validator.",
            schema_error_detail=err.message,
        ) from err
    except Exception as err:
        logger.exception("Unexpected error initializing custom validator with the provided schema.")
        raise DorsalError(
            "Could not initialize validator with the provided schema due to an unexpected error."
        ) from err

get_open_schema

get_open_schema(name)

Loads a built-in Dorsal 'open/' validation schema by its short name.

Parameters:

Name Type Description Default
name OpenSchemaName

The short name of the open schema (e.g., "generic", "llm-output"). Provides autocompletion in supported editors.

required

Returns:

Type Description
dict

The JSON schema as a Python dictionary.

Raises:

Type Description
ValueError

If the name is not a valid, known schema.

Source code in venv/lib/python3.13/site-packages/dorsal/file/schemas.py
def get_open_schema(name: OpenSchemaName) -> dict:
    """
    Loads a built-in Dorsal 'open/' validation schema by its short name.

    Args:
        name: The short name of the open schema (e.g., "generic", "llm-output").
              Provides autocompletion in supported editors.

    Returns:
        The JSON schema as a Python dictionary.

    Raises:
        ValueError: If the name is not a valid, known schema.
    """
    filename = OPEN_SCHEMA_NAME_MAP.get(name)

    # Known name: delegate loading to the packaged-resource helper.
    if filename is not None:
        return _load_schema_from_package(filename)

    raise ValueError(f"Unknown schema name: '{name}'.")

get_open_schema_validator

get_open_schema_validator(name)

Gets the pre-built, cached JsonSchemaValidator instance for a Dorsal 'open/' schema by its short name.

Source code in venv/lib/python3.13/site-packages/dorsal/file/validators/open_schema.py
def get_open_schema_validator(name: OpenSchemaName) -> JsonSchemaValidator:
    """
    Gets the pre-built, cached JsonSchemaValidator instance for a
    Dorsal 'open/' schema by its short name.

    Raises:
        ValueError: If `name` is not a known open schema.
    """
    if name in OPEN_SCHEMA_NAME_MAP:
        return _build_and_cache_validator(name)
    raise ValueError(f"Unknown schema name: '{name}'.")

make_file_extension_dependency

make_file_extension_dependency(extensions, silent=True)

Helper function to create a file extension dependency configuration.

Parameters:

Name Type Description Default
extensions Sequence[str]

A sequence (list or tuple) of file extensions (e.g., [".pdf", ".txt"]).

required
silent bool

If False, raises an error if the dependency isn't met.

True
Source code in venv/lib/python3.13/site-packages/dorsal/file/dependencies.py
def make_file_extension_dependency(
    extensions: Sequence[str],
    silent: bool = True,
) -> FileExtensionDependencyConfig:
    """
    Helper function to create a file extension dependency configuration.

    Args:
        extensions: A sequence (list or tuple) of file extensions (e.g., [".pdf", ".txt"]).
        silent: If False, raises an error if the dependency isn't met.
    """
    # A bare string is a common mistake ("pdf" instead of ["pdf"]); since
    # strings are themselves sequences, catch it explicitly.
    if isinstance(extensions, str):
        raise TypeError(
            f"The 'extensions' argument must be a sequence (like a list or tuple) of strings, not a single string.\n"
            f'       Did you mean: extensions=["{extensions}"] ?'
        )

    if not extensions:
        raise ValueError(
            "A file extension dependency must have at least one extension.\n"
            "If the model should run on all file types, call 'dorsal.testing.run_model' without 'dependencies' instead."
        )

    # Normalize every entry to a lowercase, single-dot-prefixed form
    # (".PDF", "pdf", and ".pdf" all become ".pdf").
    normalized: set[str] = set()
    for ext in extensions:
        normalized.add("." + ext.lstrip(".").lower())

    return FileExtensionDependencyConfig(
        extensions=normalized,
        silent=silent,
    )

make_media_type_dependency

make_media_type_dependency(
    include=None, exclude=None, pattern=None, silent=True
)

Helper function to create a media type dependency configuration.

Parameters:

Name Type Description Default
include Sequence[str] | None

A sequence (list or tuple) of media types (e.g., ["application/pdf"]).

None
exclude Sequence[str] | None

A sequence (list or tuple) of media types to explicitly exclude.

None
pattern str | Pattern | None

A regex pattern to match against the media type.

None
silent bool

If False, raises an error if the dependency isn't met.

True
Source code in venv/lib/python3.13/site-packages/dorsal/file/dependencies.py
def make_media_type_dependency(
    include: Sequence[str] | None = None,
    exclude: Sequence[str] | None = None,
    pattern: str | re.Pattern | None = None,
    silent: bool = True,
) -> MediaTypeDependencyConfig:
    """
    Helper function to create a media type dependency configuration.

    Args:
        include: A sequence (list or tuple) of media types (e.g., ["application/pdf"]).
        exclude: A sequence (list or tuple) of media types to explicitly exclude.
        pattern: A regex pattern to match against the media type.
        silent: If False, raises an error if the dependency isn't met.
    """
    # Bare strings are sequences too, so catch the "include='application/pdf'"
    # mistake explicitly for each argument.
    if isinstance(include, str):
        raise TypeError(
            f"The 'include' argument must be a sequence (like a list or tuple) of strings, not a single string.\n"
            f'       Did you mean: include=["{include}"] ?'
        )

    if isinstance(exclude, str):
        raise TypeError(
            f"The 'exclude' argument must be a sequence (like a list or tuple) of strings, not a single string.\n"
            f'       Did you mean: exclude=["{exclude}"] ?'
        )

    # At least one rule must be supplied; a rule-less dependency is meaningless.
    if not (include or exclude or pattern):
        raise ValueError(
            "A media type dependency must have at least one rule ('include', 'exclude', or 'pattern').\n"
            "If the model should run on all media types, call 'dorsal.testing.run_model' without 'dependencies' instead."
        )

    include_set = set(include) if include else None
    exclude_set = set(exclude) if exclude else None

    return MediaTypeDependencyConfig(
        include=include_set,
        exclude=exclude_set,
        pattern=pattern,
        silent=silent,
    )

run_model

run_model(
    annotation_model,
    file_path,
    *,
    schema_id=None,
    validation_model=None,
    dependencies=None,
    options=None
)

Tests a single AnnotationModel in isolation.

  1. FileCoreAnnotationModel retrieves base metadata.
  2. (Optional) Checks your model's dependencies
  3. Runs your model
  4. Returns the result of your model's execution.

Parameters:

Name Type Description Default
annotation_model Type[AnnotationModel]

The custom AnnotationModel class you want to test (e.g., ArchiveModel).

required
file_path str

The absolute path to the file to test against.

required
schema_id str | None

(Optional) The target schema ID (e.g., "open/generic"). If this is an "open/" schema, the standard validator will be used automatically.

None
validation_model Type[BaseModel] | JsonSchemaValidator | None

(Optional) A custom Pydantic model or JsonSchemaValidator. This overrides the automatic validator from 'schema_id'.

None
dependencies list[ModelRunnerDependencyConfig] | ModelRunnerDependencyConfig | None

(Optional) A list of dependency configs to check before running.

None
options dict[str, Any] | None

(Optional) A dictionary of options to pass to the model's .main() method.

None

Returns:

Type Description
RunModelResult

A RunModelResult object containing your model's output or any errors.

Raises:

Type Description
ValueError

If 'schema_id' is an "open/" schema and a 'validation_model' is also provided, as this is an ambiguous configuration.

Source code in venv/lib/python3.13/site-packages/dorsal/testing.py
def run_model(
    annotation_model: Type[AnnotationModel],
    file_path: str,
    *,
    schema_id: str | None = None,
    validation_model: Type[BaseModel] | JsonSchemaValidator | None = None,
    dependencies: list[ModelRunnerDependencyConfig] | ModelRunnerDependencyConfig | None = None,
    options: dict[str, Any] | None = None,
) -> RunModelResult:
    """
    Tests a single AnnotationModel in isolation.

    1.  `FileCoreAnnotationModel` retrieves base metadata.
    2.  (Optional) Checks your model's dependencies
    3.  Runs your model
    4.  Returns the result of your model's execution.

    Args:
        annotation_model: The custom AnnotationModel class you want to test (e.g., `ArchiveModel`).
        file_path: The absolute path to the file to test against.
        schema_id: (Optional) The target schema ID (e.g., "open/generic").
                   If this is an "open/" schema, the standard validator
                   will be used automatically.
        validation_model: (Optional) A *custom* Pydantic model or
                          JsonSchemaValidator. This overrides the
                          automatic validator from 'schema_id'.
        dependencies: (Optional) A list of dependency configs to check before running.
        options: (Optional) A dictionary of options to pass to the model's `.main()` method.

    Returns:
        A RunModelResult object containing your model's output or any errors.

    Raises:
        ValueError: If 'schema_id' is an "open/" schema and a
                    'validation_model' is also provided, as this
                    is an ambiguous configuration.
    """
    # debug=True so RunModelResult.time_taken is populated (see RunModelResult).
    runner = ModelRunner(pipeline_config=None, debug=True, testing=True)

    # Step 1: gather base file metadata. The dependency checkers below read
    # this result's record (extension, media_type, name, size).
    logger.info(f"Running FileCoreAnnotationModel on {file_path}...")
    base_model_result = runner.run_single_model(
        annotation_model=FileCoreAnnotationModel,
        validation_model=FileCoreValidationModelStrict,
        file_path=file_path,
        options={"calculate_similarity_hash": True},
    )

    if base_model_result.error or base_model_result.record is None:
        logger.error(f"Base model failed, cannot proceed: {base_model_result.error}")
        return base_model_result

    # Accept either a single dependency config or a list of them.
    deps_list = dependencies
    if deps_list and not isinstance(deps_list, list):
        deps_list = [deps_list]

    if deps_list:
        logger.info("Checking dependencies...")
        for dep_config in deps_list:
            # Unrecognized config types leave is_met False, so they are
            # treated as "dependency not met".
            is_met = False
            if isinstance(dep_config, MediaTypeDependencyConfig):
                is_met = check_media_type_dependency([base_model_result], dep_config)
            elif isinstance(dep_config, FileExtensionDependencyConfig):
                is_met = check_extension_dependency([base_model_result], dep_config)
            elif isinstance(dep_config, FilenameDependencyConfig):
                is_met = check_name_dependency([base_model_result], dep_config)
            elif isinstance(dep_config, FileSizeDependencyConfig):
                is_met = check_size_dependency([base_model_result], dep_config)
            if not is_met:
                dep_type = getattr(dep_config, "type", "Unknown")
                error_msg = f"Skipped: Dependency not met: {dep_type}"
                logger.warning(error_msg)
                # Short-circuit: return a "skipped" result without running the model.
                return RunModelResult(
                    name=annotation_model.id or annotation_model.__name__,
                    source=AnnotationModelSource(
                        type="Model",
                        id=annotation_model.id or annotation_model.__name__,
                        version=annotation_model.version,
                        variant=annotation_model.variant,
                    ),
                    record=None,
                    schema_id=schema_id,
                    error=error_msg,
                )
        logger.info("All dependencies met.")

    # 'open/' schema_ids resolve to a built-in validator, so supplying a
    # custom validation_model at the same time is ambiguous.
    if validation_model and schema_id and schema_id.startswith("open/"):
        raise ValueError(
            f"Ambiguous configuration: You cannot provide a custom 'validation_model' when using an 'open/' schema_id ('{schema_id}').\n"
            f"The 'open/' schemas use a standard, built-in validator.\n"
            f"  - To test with the standard validator: Remove the 'validation_model' argument.\n"
            f"  - To test with your custom validator: Use a custom 'schema_id' (e.g., 'my-org/my-custom-schema') or set schema_id=None."
        )

    effective_validator: Type[BaseModel] | JsonSchemaValidator | None = validation_model

    if effective_validator:
        logger.debug("Using explicitly provided 'validation_model'.")
    elif schema_id and schema_id.startswith("open/"):
        schema_name = schema_id.removeprefix("open/")
        try:
            effective_validator = get_open_schema_validator(cast(OpenSchemaName, schema_name))
            logger.debug("Resolved 'schema_id' (%s) to standard validator.", schema_id)
        except (ValueError, TypeError, RuntimeError) as e:
            # Best-effort: an unresolvable 'open/' schema downgrades to
            # "no validation" rather than failing the test run.
            logger.warning(
                "Could not find a standard validator for 'schema_id' (%s): %s. Proceeding without validation.",
                schema_id,
                e,
            )

    # Early warnings for classes defined in __main__: they cannot be
    # registered later because they are not importable by path.
    if annotation_model.__module__ == "__main__":
        logger.debug(
            "The 'annotation_model' (%s) is defined in your main script. "
            "Move it to an importable .py file before using 'register_model' or that function will complain.",
            annotation_model.__name__,
        )

    if effective_validator and effective_validator.__module__ == "__main__":
        validator_name = getattr(effective_validator, "__name__", str(effective_validator))
        logger.debug(
            "The 'validation_model' (%s) is defined in your main script. "
            "Move it to an importable .py file before using 'register_model' or that function will complain.",
            validator_name,
        )

    # Step 3: run the model under test, passing the base result through so
    # its record is available as prior context.
    logger.info("Running %s on file: '%s'", annotation_model.__name__, file_path)
    my_model_result = runner.run_single_model(
        annotation_model=annotation_model,
        validation_model=effective_validator,
        file_path=file_path,
        base_model_result=base_model_result,
        schema_id=schema_id,
        options=options,
    )

    return my_model_result

dorsal.file.configs.model_runner

DependencyConfig pydantic-model

Bases: BaseModel

  • type (str): The primary identifier of the dependency.
  • checker (CallableImportPath) - Defines the path to the dependency check function - The dependency check function returns a boolean indicating whether the dependency was met - The dependency check function always takes, as input, the list of prior model outputs (which is always at least the base file model result, at index 0)
  • silent (bool): - When set to False, raises a DependencyNotMetError exception whenever the dependency is not met - When set to True, no exception is raised in the case of a dependency not being met.

Fields:

  • type (str)
  • checker (CallableImportPath)
  • silent (bool)

FileExtensionDependencyConfig pydantic-model

Bases: DependencyConfig

This dependency configures which file extensions to execute a model for.

Fields:

  • type (Literal['extension'])
  • checker (CallableImportPath)
  • silent (bool)
  • extensions (set[str])

FileSizeDependencyConfig pydantic-model

Bases: DependencyConfig

This dependency configures a model to run based on file size.

Fields:

  • type (Literal['file_size'])
  • checker (CallableImportPath)
  • silent (bool)
  • min_size (int | None)
  • max_size (int | None)

FilenameDependencyConfig pydantic-model

Bases: DependencyConfig

This dependency configures a model to run based on the file's name.

Fields:

MediaTypeDependencyConfig pydantic-model

Bases: DependencyConfig

This dependency configures which Media Types to execute a model for.

You can define the match rule for Media Type using any combination of pattern, include, or exclude.

  • silent (bool = True): - by default, when not met, the MediaTypeDependencyConfig does not raise. - Set to False if you want it to raise an exception
  • pattern (str or re.Pattern): match the media type using a regular expression. If the media type matches, the model executes
  • include - If the Media Type is in this sequence, the model executes
  • exclude - Exclusion rule: if the media type is in this sequence (even if it matches via pattern or include) it is blocked

Fields:

  • type (Literal['media_type'])
  • checker (CallableImportPath)
  • silent (bool)
  • pattern (str | Pattern | None)
  • include (set[MediaTypePartString] | None)
  • exclude (set[MediaTypePartString] | None)

ModelRunnerPipelineStep pydantic-model

Bases: BaseModel

Single step in the ModelRunner execution pipeline.

  • annotation_model: Two-part path to an Annotation Model.
  • dependencies: Rules to trigger execution.
  • validation_model: Path to validation logic.
  • schema_id: Unique dataset ID.
  • options: Runtime options for the model.
  • ignore_linter_errors: Skip strict linting if True.
  • deactivated: (Optional) If True, this step is skipped. Defaults to False.

Fields:

  • annotation_model (CallableImportPath)
  • dependencies (list[ModelRunnerDependencyConfig] | None)
  • validation_model (CallableImportPath | dict | None)
  • schema_id (DatasetID)
  • options (dict[str, Any] | None)
  • ignore_linter_errors (bool)
  • deactivated (bool)

RunModelResult pydantic-model

Bases: BaseModel

The standardized result object returned by ModelRunner execution steps.

This object encapsulates the output of a single Annotation Model, including its generated data, source identity, execution timing, and any errors encountered.

Fields:

error pydantic-field

error = None

A descriptive error message if the model failed, crashed, or if a dependency was not met.

name pydantic-field

name

The display name of the model (usually the class name).

record pydantic-field

record

The generated annotation record (dict). None if the model failed, was skipped, or produced no output.

schema_id pydantic-field

schema_id

The validation schema/model ID against which this record was validated.

schema_version pydantic-field

schema_version = None

The version of the schema/model against which this record was validated.

source pydantic-field

source

Structured metadata identifying the model source (ID, version, variant).

time_taken pydantic-field

time_taken = None

Execution time in seconds. Populated only if debug mode is active.

check_extension_dependency

check_extension_dependency(model_results, config)

Check whether the extension is within the scope of the annotation model.

Source code in venv/lib/python3.13/site-packages/dorsal/file/configs/model_runner.py
def check_extension_dependency(model_results: "list[RunModelResult]", config: "FileExtensionDependencyConfig") -> bool:
    """
    Check whether the extension is within the scope of the annotation model.

    Reads the 'extension' field from the base file record (index 0) and
    tests it, lowercased, against the configured extension set.

    Args:
        model_results: Prior model outputs; index 0 is the base file record.
        config: The extension dependency configuration to check against.

    Returns:
        True if the file's extension is in `config.extensions`, else False.
    """
    base_record = model_results[0].record

    if not base_record:
        logger.debug("No base record found")
        return False

    # Use .get() so a record lacking the 'extension' key counts as
    # "dependency not met" instead of raising KeyError — consistent with
    # the other dependency checkers in this module.
    extension: str | None = base_record.get("extension")
    if not extension:
        return False

    return extension.lower() in config.extensions

check_media_type_dependency

check_media_type_dependency(model_results, config)

Check whether the media type is within the scope of the annotation model.

  • Performs exact string matches or regex matches on the full and partial media type.
  • Checks both full and partial media type in the following order:
    1. If in exclude, return False
    2. If in include or matching pattern, return True
    3. If include or pattern were provided, then it failed. Return False
    4. If neither include nor pattern were provided, then it passed. Return True
Source code in venv/lib/python3.13/site-packages/dorsal/file/configs/model_runner.py
def check_media_type_dependency(model_results: "list[RunModelResult]", config: "MediaTypeDependencyConfig") -> bool:
    """
    Check whether the media type is within the scope of the annotation model.

    - Performs exact string matches or regex matches on the full and partial media type.
    - Checks both *full* and *partial* media type in the following order:
        1. If in `exclude`, return False
        2. If in `include` or matching `pattern`, return True
        3. If `include` or `pattern` were provided, then it failed. Return False
        4. If neither `include` nor `pattern` were provided, then it passed. Return True
    """
    base_record = model_results[0].record

    if not base_record:
        logger.debug("No base record found for media type dependency check")
        return False

    full_type = base_record.get("media_type")
    if not full_type:
        logger.debug("Base record has no 'media_type' field")
        return False

    # Both the full type ("text/plain") and its head ("text") participate
    # in every rule, in that order.
    head_type, _ = full_type.split("/")
    candidates = (full_type, head_type)

    # Exclusion wins over everything else.
    if config.exclude and any(candidate in config.exclude for candidate in candidates):
        logger.debug(f"Media type {full_type} is explicitly excluded.")
        return False

    # No inclusion rules at all: pass by default.
    if not (config.include or config.pattern):
        logger.debug(f"Media type {full_type} passed (no inclusion rules specified).")
        return True

    if config.include and any(candidate in config.include for candidate in candidates):
        logger.debug(f"Media type {full_type} matched 'include' list.")
        return True

    if config.pattern:
        rx = re.compile(config.pattern) if isinstance(config.pattern, str) else config.pattern

        if rx.match(full_type):
            logger.debug(f"Media type {full_type} matched 'pattern' (full).")
            return True

        if rx.match(head_type):
            logger.debug(f"Media type {full_type} matched 'pattern' (head).")
            return True

    logger.debug(f"Media type {full_type} did not match any 'include' or 'pattern' rules.")
    return False

check_name_dependency

check_name_dependency(model_results, config)

Check whether the file's name matches the regex pattern.

Source code in venv/lib/python3.13/site-packages/dorsal/file/configs/model_runner.py
def check_name_dependency(model_results: list[RunModelResult], config: FilenameDependencyConfig) -> bool:
    """
    Check whether the file's name matches the regex pattern.

    Args:
        model_results: Prior model outputs; index 0 is the base file record.
        config: The filename dependency configuration (carries `pattern`).

    Returns:
        True if `config.pattern` is found in the file's name, else False.
    """
    base_record = model_results[0].record

    if not base_record:
        logger.debug("No base record found for filename dependency check")
        return False

    filename = base_record.get("name")
    if not filename:
        logger.debug("Base record has no 'name' field")
        return False

    if isinstance(config.pattern, str):
        rx = re.compile(config.pattern)
    else:
        rx = config.pattern

    # Log the actual filename; the previous messages logged the literal
    # placeholder "(unknown)" instead of interpolating it.
    if rx.search(filename):
        logger.debug(f"Filename '{filename}' matched pattern '{config.pattern}'.")
        return True

    logger.debug(f"Filename '{filename}' did not match pattern '{config.pattern}'.")
    return False

check_size_dependency

check_size_dependency(model_results, config)

Check whether the file size is within the scope of the annotation model.

Source code in venv/lib/python3.13/site-packages/dorsal/file/configs/model_runner.py
def check_size_dependency(model_results: list[RunModelResult], config: FileSizeDependencyConfig) -> bool:
    """Check whether the file size is within the scope of the annotation model."""
    base_record = model_results[0].record

    if not base_record:
        logger.debug("No base record found for file size dependency check")
        return False

    size = base_record.get("size")
    # A missing or non-integer size fails the check outright.
    if not isinstance(size, int):
        logger.debug("Base record has no valid 'size' field")
        return False

    lower = config.min_size
    upper = config.max_size

    if lower is not None and size < lower:
        logger.debug(f"File size {size} is less than min_size {lower}.")
        return False

    if upper is not None and size > upper:
        logger.debug(f"File size {size} is greater than max_size {upper}.")
        return False

    logger.debug(f"File size {size} passed dependency check.")
    return True