High Level Functions

These functions provide a high-level interface for generating and managing file records, both locally and on DorsalHub.

dorsal.api

activate_model_by_name

activate_model_by_name(name, scope='project')

Activates (enables) a model in the pipeline by name.
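
A minimal usage sketch (the model name "exif" is a placeholder, not a model that necessarily ships with the library):

from dorsal.api import activate_model_by_name

# Enable a pipeline model for the current project (the default scope).
activate_model_by_name("exif")

# Enable the same model globally, for all projects.
activate_model_by_name("exif", scope="global")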

Source code in venv/lib/python3.13/site-packages/dorsal/api/config.py
def activate_model_by_name(name: str, scope: Literal["project", "global"] = "project") -> None:
    """Activates (enables) a model in the pipeline by name."""
    PipelineConfig.set_step_status_by_name(name=name, active=True, scope=scope)

add_files_to_collection

add_files_to_collection(
    collection_id, hashes, api_key=None
)

Adds a list of files to a remote collection by their hash.

This function automatically handles batching for large lists of hashes to comply with the API limit (10,000 per request).
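
A minimal usage sketch (the collection ID and hashes below are placeholders):

from dorsal.api import add_files_to_collection

# Add files, identified by their SHA-256 hashes, to an existing collection.
# Batching beyond the 10,000-hash API limit is handled automatically.
response = add_files_to_collection(
    collection_id="col_abc123",
    hashes=["<sha-256 hash 1>", "<sha-256 hash 2>"],
)
print(f"Added: {response.added_count}, duplicates: {response.duplicate_count}, invalid: {response.invalid_count}")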

Parameters:

collection_id (str, required): The unique ID of the collection to modify.
hashes (list[str], required): A list of SHA-256 file hashes to add.
api_key (str, optional): An API key for this request. Defaults to None.

Returns:

AddFilesResponse: A consolidated response summarizing the results of all batches.

Raises:

DorsalClientError: For API errors.
DorsalError: For other unexpected library errors.

Source code in venv/lib/python3.13/site-packages/dorsal/api/collection.py
def add_files_to_collection(
    collection_id: str,
    hashes: list[str],
    api_key: str | None = None,
) -> "AddFilesResponse":
    """
    Adds a list of files to a remote collection by their hash.

    This function automatically handles batching for large lists of hashes
    to comply with the API limit (10,000 per request).

    Args:
        collection_id (str): The unique ID of the collection to modify.
        hashes (list[str]): A list of SHA-256 file hashes to add.
        api_key (str, optional): An API key for this request.

    Returns:
        AddFilesResponse: A consolidated response summarizing the results of all batches.

    Raises:
        DorsalClientError: For API errors.
        DorsalError: For other unexpected library errors.
    """
    from dorsal.session import get_shared_dorsal_client
    from dorsal.client import DorsalClient
    from dorsal.client.validators import AddFilesResponse

    if not hashes:
        raise ValueError("The 'hashes' list cannot be empty.")

    effective_client: DorsalClient
    log_message_context = "using shared client"
    if api_key:
        log_message_context = "using temporary client with provided API key"
        effective_client = DorsalClient(api_key=api_key)
    else:
        effective_client = get_shared_dorsal_client()

    logger.debug(
        "Dispatching add_files_to_collection to client (%s) for ID: '%s' with %d hashes.",
        log_message_context,
        collection_id,
        len(hashes),
    )

    total_added = 0
    total_duplicates = 0
    total_invalid = 0
    batches = [hashes[i : i + API_MAX_BATCH_SIZE] for i in range(0, len(hashes), API_MAX_BATCH_SIZE)]

    try:
        for i, batch in enumerate(batches):
            logger.debug(f"Processing batch {i + 1}/{len(batches)}...")
            response = effective_client.add_files_to_collection(collection_id=collection_id, hashes=batch)
            total_added += response.added_count
            total_duplicates += response.duplicate_count
            total_invalid += response.invalid_count

        aggregate_response = AddFilesResponse(
            added_count=total_added,
            duplicate_count=total_duplicates,
            invalid_count=total_invalid,
        )
        logger.info(
            "Successfully finished adding files to collection '%s'. Added: %d, Duplicates: %d, Invalid: %s",
            collection_id,
            aggregate_response.added_count,
            aggregate_response.duplicate_count,
            aggregate_response.invalid_count,
        )
        return aggregate_response

    except DorsalError as err:
        logger.warning(
            "A client error occurred during add_files_to_collection (%s): %s",
            log_message_context,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "An unexpected error occurred during add_files_to_collection (%s).",
            log_message_context,
        )
        raise DorsalError(
            f"An unexpected error occurred while adding files to collection '{collection_id}': {err}"
        ) from err

add_tag_to_file

add_tag_to_file(
    hash_string, name, value, private, api_key=None
)

Adds a single tag to a file record on DorsalHub.
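
A minimal usage sketch (the hash string, tag name, and value are placeholders):

from dorsal.api import add_tag_to_file

# Attach a private "project" tag to an existing file record.
response = add_tag_to_file(
    hash_string="<sha-256 hash>",
    name="project",
    value="alpha",
    private=True,
)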

Parameters:

hash_string (str, required): The hash of the file record to tag.
name (str, required): The name of the tag.
value (Any, required): The value of the tag.
private (bool, required): Whether the tag itself is private (True) or public (False).
api_key (str, optional): An API key for this request. Defaults to None.

Returns:

FileTagResponse: A response object from the API.

Source code in venv/lib/python3.13/site-packages/dorsal/api/file.py
def add_tag_to_file(
    hash_string: str, name: str, value: Any, private: bool, api_key: str | None = None
) -> FileTagResponse:
    """
    Adds a single tag to a file record on DorsalHub.

    Args:
        hash_string (str): The hash of the file record to tag.
        name (str): The name of the tag.
        value (Any): The value of the tag.
        private (bool): Whether the tag itself is private (True) or public (False).
        api_key (str, optional): An API key for this request.

    Returns:
        FileTagResponse: A response object from the API.
    """
    from dorsal.session import get_shared_dorsal_client
    from dorsal.file.validators.file_record import NewFileTag

    effective_client = get_shared_dorsal_client()
    if api_key:
        from dorsal.client import DorsalClient

        effective_client = DorsalClient(api_key=api_key)

    try:
        new_tag = NewFileTag(name=name, value=value, private=private)
        tag_result = effective_client.add_tags_to_file(file_hash=hash_string, tags=[new_tag])
        return tag_result
    except (DorsalClientError, ValueError):
        raise

deactivate_model_by_name

deactivate_model_by_name(name, scope='project')

Deactivates (disables) a model in the pipeline by name.
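
A minimal usage sketch (the model name "exif" is a placeholder):

from dorsal.api import deactivate_model_by_name

# Disable a pipeline model for the current project only.
deactivate_model_by_name("exif")

# Disable it globally instead.
deactivate_model_by_name("exif", scope="global")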

Source code in venv/lib/python3.13/site-packages/dorsal/api/config.py
def deactivate_model_by_name(name: str, scope: Literal["project", "global"] = "project") -> None:
    """Deactivates (disables) a model in the pipeline by name."""
    PipelineConfig.set_step_status_by_name(name=name, active=False, scope=scope)

delete_collection

delete_collection(collection_id, api_key=None)

Deletes a file collection from DorsalHub by its ID.
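
A minimal usage sketch (the collection ID is a placeholder):

from dorsal.api import delete_collection

# Permanently delete a collection by its ID.
delete_collection("col_abc123")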

Parameters:

collection_id (str, required): The unique ID of the collection to delete.
api_key (str, optional): An API key for this request. Defaults to None.

Raises:

DorsalClientError: For API errors.
DorsalError: For other unexpected library errors.

Source code in venv/lib/python3.13/site-packages/dorsal/api/collection.py
def delete_collection(
    collection_id: str,
    api_key: str | None = None,
) -> None:
    """
    Deletes a file collection from DorsalHub by its ID.

    Args:
        collection_id (str): The unique ID of the collection to delete.
        api_key (str, optional): An API key for this request.

    Raises:
        DorsalClientError: For API errors.
        DorsalError: For other unexpected library errors.
    """
    from dorsal.session import get_shared_dorsal_client
    from dorsal.client import DorsalClient

    if not collection_id:
        raise ValueError("collection_id cannot be empty.")

    effective_client: DorsalClient
    log_message_context = "using shared client"
    if api_key:
        log_message_context = "using temporary client with provided API key"
        effective_client = DorsalClient(api_key=api_key)
    else:
        effective_client = get_shared_dorsal_client()

    logger.debug(
        "Dispatching delete_collections to client (%s) for ID: '%s'",
        log_message_context,
        collection_id,
    )

    try:
        effective_client.delete_collections(collection_ids=[collection_id])
        logger.info("Successfully dispatched deletion for collection '%s'", collection_id)
        return

    except DorsalError as err:
        logger.warning(
            "A client error occurred during delete_dorsal_collection (%s): %s",
            log_message_context,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "An unexpected error occurred during delete_dorsal_collection (%s).",
            log_message_context,
        )
        raise DorsalError(f"An unexpected error occurred while deleting collection '{collection_id}': {err}") from err

delete_private_dorsal_file_record

delete_private_dorsal_file_record(
    file_hash,
    *,
    tags="none",
    annotations="none",
    api_key=None
)

Delete the private file record for the given hash.
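
A minimal usage sketch (the hash is a placeholder):

from dorsal.api import delete_private_dorsal_file_record

# Delete the private record and any private tags attached to it,
# leaving annotations untouched (the default).
result = delete_private_dorsal_file_record(
    "<sha-256 hash>",
    tags="private",
)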

Parameters:

file_hash (str, required): The SHA-256 hash of the file record to delete.
tags (DeletionScope | None, optional): Specifies which tags to delete. Options: "all", "public", "private", "none", or None (treated as "none"). Defaults to "none".
annotations (DeletionScope | None, optional): Specifies which annotations to delete. Options: "all", "public", "private", "none", or None (treated as "none"). Defaults to "none".
api_key (str, optional): An API key to use for this request. Defaults to None.

Returns:

FileDeleteResponse: An object summarizing the result of the delete operation.

Raises:

ValueError: If the provided file_hash is not valid.
DorsalClientError: For API errors (e.g., not found, permission denied).
DorsalError: For other unexpected library errors.

Source code in venv/lib/python3.13/site-packages/dorsal/api/file.py
def delete_private_dorsal_file_record(
    file_hash: str,
    *,
    tags: DeletionScope | None = "none",
    annotations: DeletionScope | None = "none",
    api_key: str | None = None,
) -> "FileDeleteResponse":
    """
    Delete the private file record for the given hash.

    Args:
        file_hash (str): The SHA-256 hash of the file record to delete.
        tags (DeletionScope | None): Specifies which tags to delete.
            Options: "all", "public", "private", "none", or None (treated as "none").
            Defaults to "none".
        annotations (DeletionScope | None): Specifies which annotations to delete.
            Options: "all", "public", "private", "none", or None (treated as "none").
            Defaults to "none".
        api_key (str, optional): An API key to use for this request.

    Returns:
        FileDeleteResponse: An object summarizing the result of the delete operation.

    Raises:
        ValueError: If the provided `file_hash` is not valid.
        DorsalClientError: For API errors (e.g., not found, permission denied).
        DorsalError: For other unexpected library errors.
    """
    logger.info(
        "Requesting deletion of PRIVATE record for hash %s (tags=%s, annotations=%s)",
        file_hash,
        tags or "none",
        annotations or "none",
    )

    return _delete_dorsal_file_record(
        file_hash=file_hash,
        record="private",
        tags=tags if tags is not None else "none",
        annotations=annotations if annotations is not None else "none",
        api_key=api_key,
    )

delete_public_dorsal_file_record

delete_public_dorsal_file_record(
    file_hash,
    *,
    tags="none",
    annotations="none",
    api_key=None
)

Delete the public file record for the given hash.
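
A minimal usage sketch (the hash is a placeholder):

from dorsal.api import delete_public_dorsal_file_record

# Delete the public record along with all of its tags and annotations.
result = delete_public_dorsal_file_record(
    "<sha-256 hash>",
    tags="all",
    annotations="all",
)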

Parameters:

file_hash (str, required): The SHA-256 hash of the file record to delete.
tags (DeletionScope | None, optional): Specifies which tags to delete. Options: "all", "public", "private", "none", or None (treated as "none"). Defaults to "none".
annotations (DeletionScope | None, optional): Specifies which annotations to delete. Options: "all", "public", "private", "none", or None (treated as "none"). Defaults to "none".
api_key (str, optional): An API key to use for this request. Defaults to None.

Returns:

FileDeleteResponse: An object summarizing the result of the delete operation.

Raises:

ValueError: If the provided file_hash is not valid.
DorsalClientError: For API errors (e.g., not found, permission denied).
DorsalError: For other unexpected library errors.

Source code in venv/lib/python3.13/site-packages/dorsal/api/file.py
def delete_public_dorsal_file_record(
    file_hash: str,
    *,
    tags: DeletionScope | None = "none",
    annotations: DeletionScope | None = "none",
    api_key: str | None = None,
) -> "FileDeleteResponse":
    """
    Delete the public file record for the given hash.

    Args:
        file_hash (str): The SHA-256 hash of the file record to delete.
        tags (DeletionScope | None): Specifies which tags to delete.
            Options: "all", "public", "private", "none", or None (treated as "none").
            Defaults to "none".
        annotations (DeletionScope | None): Specifies which annotations to delete.
            Options: "all", "public", "private", "none", or None (treated as "none").
            Defaults to "none".
        api_key (str, optional): An API key to use for this request.

    Returns:
        FileDeleteResponse: An object summarizing the result of the delete operation.

    Raises:
        ValueError: If the provided `file_hash` is not valid.
        DorsalClientError: For API errors (e.g., not found, permission denied).
        DorsalError: For other unexpected library errors.
    """
    logger.info(
        "Requesting deletion of PUBLIC record for hash %s (tags=%s, annotations=%s)",
        file_hash,
        tags or "none",
        annotations or "none",
    )

    return _delete_dorsal_file_record(
        file_hash=file_hash,
        record="public",
        tags=tags if tags is not None else "none",
        annotations=annotations if annotations is not None else "none",
        api_key=api_key,
    )

export_collection

export_collection(
    collection_id,
    output_path,
    poll_interval=5,
    timeout=3600,
    api_key=None,
    console=None,
    palette=None,
)

Exports a remote file collection from DorsalHub.

Starts an export job, polls for its completion, and downloads to a local path.

The exported file will be in .json.gz format.
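
A minimal usage sketch (the collection ID and output path are placeholders):

from dorsal.api import export_collection

# Start an export job, poll until it finishes, and download the result.
export_collection(
    collection_id="col_abc123",
    output_path="exports/my_collection.json.gz",
)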

Parameters:

collection_id (str, required): The ID of the collection to export.
output_path (str | Path, required): The local path to save the exported file.
poll_interval (int, optional): Seconds to wait between status checks. Defaults to 5.
timeout (int | None, optional): Total seconds to wait for the job to complete. Defaults to 3600.
api_key (str, optional): An API key for this request. Defaults to None.
console (Console, optional): A rich.console.Console for progress display. Defaults to None.
palette (dict, optional): Color palette for the progress bar. Defaults to None.

Raises:

DorsalClientError: For API errors.
DorsalError: For other unexpected library errors.

Source code in venv/lib/python3.13/site-packages/dorsal/api/collection.py
def export_collection(
    collection_id: str,
    output_path: str | pathlib.Path,
    poll_interval: int = 5,
    timeout: int | None = 3600,
    api_key: str | None = None,
    console: "Console | None" = None,
    palette: dict | None = None,
) -> None:
    """
    Exports a remote file collection from DorsalHub.

    Starts an export job, polls for its completion, and downloads to a local path.

    The exported file will be in .json.gz format.

    Args:
        collection_id (str): The ID of the collection to export.
        output_path (str | pathlib.Path): The local path to save the exported file.
        poll_interval (int): Seconds to wait between status checks.
        timeout (int | None): Total seconds to wait for the job to complete.
        api_key (str, optional): An API key for this request.
        console (Console, optional): A rich.console.Console for progress display.
        palette (dict, optional): Color palette for the progress bar.

    Raises:
        DorsalClientError: For API errors.
        DorsalError: For other unexpected library errors.
    """
    from dorsal.session import get_shared_dorsal_client
    from dorsal.client import DorsalClient

    effective_client: DorsalClient
    log_message_context = "using shared client"
    if api_key:
        log_message_context = "using temporary client with provided API key"
        effective_client = DorsalClient(api_key=api_key)
    else:
        effective_client = get_shared_dorsal_client()

    logger.debug(
        "Dispatching export_collection to client (%s) for ID: '%s'",
        log_message_context,
        collection_id,
    )

    try:
        effective_client.export_collection(
            collection_id=collection_id,
            output_path=str(output_path),
            poll_interval=poll_interval,
            timeout=timeout,
            console=console,
            palette=palette,
        )
        logger.info("Successfully exported collection '%s' to '%s'", collection_id, output_path)

    except DorsalError as err:
        logger.warning(
            "A client error occurred during export_dorsal_collection (%s): %s",
            log_message_context,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "An unexpected error occurred during export_dorsal_collection (%s).",
            log_message_context,
        )
        raise DorsalError(f"An unexpected error occurred while exporting collection '{collection_id}': {err}") from err

find_duplicates

find_duplicates(
    path,
    recursive=False,
    min_size=0,
    max_size=None,
    mode="hybrid",
    use_cache=True,
    progress_console=None,
    palette=None,
)

Finds duplicate files in a directory using a multi-pass filtering strategy. Candidate files are first grouped by size; in the default "hybrid" mode they are then narrowed with a fast quick-hash pass and confirmed with a final SHA-256 pass, while the "quick" and "sha256" modes use only the corresponding hash.
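
A minimal usage sketch (the directory path is a placeholder; the result keys shown match those produced by the implementation below):

from dorsal.api import find_duplicates

# Scan a directory tree for duplicates using the default hybrid strategy.
results = find_duplicates("path/to/photos", recursive=True)

for dup_set in results.get("duplicate_sets", []):
    print(f"{dup_set['count']} identical copies found")
print(f"Hashes served from cache: {results.get('hashes_from_cache', 0)}")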

Source code in venv/lib/python3.13/site-packages/dorsal/api/file.py
def find_duplicates(
    path: str | pathlib.Path,
    recursive: bool = False,
    min_size: str | int = 0,
    max_size: str | int | None = None,
    mode: Literal["hybrid", "quick", "sha256"] = "hybrid",
    use_cache: bool = True,
    progress_console: "Console" | None = None,
    palette: dict | None = None,
) -> dict:
    """
    Finds duplicate files in a directory using a multi-pass filtering strategy.
    """

    path = pathlib.Path(path)
    min_size_bytes = parse_filesize(min_size) if isinstance(min_size, str) else min_size
    max_size_bytes = parse_filesize(max_size) if isinstance(max_size, str) else max_size

    candidate_files, total_scanned, inaccessible_count = _filter_by_size(
        path, recursive, min_size_bytes, max_size_bytes, progress_console, palette
    )
    if not candidate_files:
        logger.debug("No potential duplicates found based on file size. Finished.")
        return {}

    results = {}
    total_cache_hits = 0

    if mode == "hybrid":
        logger.debug("Pass 2/3: Identifying potential duplicates with QUICK hash...")
        quick_hash_map, quick_cache_hits = _find_duplicates_quick(candidate_files, use_cache, progress_console, palette)
        total_cache_hits += quick_cache_hits

        potential_duplicates = [pathlib.Path(p) for s in quick_hash_map.values() if len(s) > 1 for p in s]

        if not potential_duplicates:
            logger.debug("No potential duplicates found after QUICK hash pass. Finished.")
            results = _format_duplicate_results(path=str(path), hash_map=quick_hash_map)
        else:
            logger.debug(
                "Pass 2 complete. Found %d potential duplicates to verify with SHA-256.",
                len(potential_duplicates),
            )
            logger.debug("Pass 3/3: Verifying duplicates with SHA-256 hash...")
            final_hash_map, sha_cache_hits = _find_duplicates_sha256(
                potential_duplicates, use_cache, progress_console, palette
            )
            total_cache_hits += sha_cache_hits
            results = _format_duplicate_results(path=str(path), hash_map=final_hash_map)

    elif mode == "quick":
        logger.warning("Using 'quick' mode. Results may include false positives.")
        logger.debug("Pass 2/2: Identifying potential duplicates with QUICK hash...")
        quick_hash_map, total_cache_hits = _find_duplicates_quick(candidate_files, use_cache, progress_console, palette)
        results = _format_duplicate_results(path=str(path), hash_map=quick_hash_map)

    elif mode == "sha256":
        logger.debug("Pass 2/2: Identifying duplicates with SHA-256 hash...")
        sha256_hash_map, total_cache_hits = _find_duplicates_sha256(
            candidate_files, use_cache, progress_console, palette
        )
        results = _format_duplicate_results(path=str(path), hash_map=sha256_hash_map)

    else:
        raise ValueError(f"Invalid mode '{mode}'. Must be one of 'hybrid', 'quick', or 'sha256'.")

    results["hashes_from_cache"] = total_cache_hits

    if results and results.get("total_sets", 0) > 0:
        total_duplicate_files = sum(s["count"] for s in results.get("duplicate_sets", []))
        logger.debug(
            "Duplicate search complete. Scanned %d files (%d inaccessible, %d hashes from cache). "
            "Found %d sets of duplicates, comprising %d total files.",
            total_scanned,
            inaccessible_count,
            total_cache_hits,
            results["total_sets"],
            total_duplicate_files,
        )
    else:
        logger.debug("Duplicate search complete. No duplicate sets were found.")

    return results

generate_html_directory_report

generate_html_directory_report(
    dir_path,
    output_path=None,
    *,
    local_collection=None,
    template="default",
    use_cache=True,
    recursive=False
)

Generates a self-contained HTML dashboard for a directory of files.

This function orchestrates the creation of a rich, interactive HTML document. It processes a directory into a LocalFileCollection, generates data for various UI panels based on user configuration, and renders the result using a flexible Jinja2 template system.
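
A minimal usage sketch (paths are placeholders):

from dorsal.api import generate_html_directory_report

# Render a dashboard for a directory tree and save it to disk.
generate_html_directory_report(
    "path/to/data",
    output_path="directory_report.html",
    recursive=True,
)

# Or keep the HTML in memory (the string is returned when no output_path is given).
html_content = generate_html_directory_report("path/to/data")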

Parameters:

dir_path (str, required): The path to the local directory to report on.
output_path (str, optional): If provided, the HTML dashboard will be saved to this file path. Defaults to None.
local_collection (LocalFileCollection, optional): An existing, pre-processed collection can be passed to avoid re-scanning the directory. Defaults to None.
template (str, optional): The name of the template to use. Defaults to "default".
use_cache (bool, optional): Whether to use the local cache during file processing. Defaults to True.
recursive (bool, optional): Whether to scan the directory recursively. Defaults to False.

Returns:

str | None: The generated HTML dashboard as a string, or None if output_path was provided and the dashboard was written to disk.

Raises:

DorsalError: If file processing or report generation fails.
TemplateNotFoundError: If the specified template cannot be located.
FileNotFoundError: If the specified dir_path does not exist.

Source code in venv/lib/python3.13/site-packages/dorsal/api/file.py
def generate_html_directory_report(
    dir_path: str,
    output_path: str | None = None,
    *,
    local_collection: LocalFileCollection | None = None,
    template: str = "default",
    use_cache: bool = True,
    recursive: bool = False,
) -> str | None:
    """
    Generates a self-contained HTML dashboard for a directory of files.

    This function orchestrates the creation of a rich, interactive HTML document.
    It processes a directory into a LocalFileCollection, generates data for various
    UI panels based on user configuration, and renders the result using a
    flexible Jinja2 template system.

    Args:
        dir_path (str): The path to the local directory to report on.
        local_collection (LocalFileCollection, optional): An existing, pre-processed
            collection can be passed to avoid re-scanning the directory.
        output_path (str, optional): If provided, the HTML dashboard will be saved
            to this file path.
        template (str, optional): The name of the template to use. Defaults to "default".
        use_cache (bool, optional): Whether to use the local cache during file processing.
        recursive (bool, optional): Whether to scan the directory recursively.

    Returns:
        str | None: The generated HTML dashboard as a string, or None if it was written to output_path.

    Raises:
        DorsalError: If file processing or report generation fails.
        TemplateNotFoundError: If the specified template cannot be located.
        FileNotFoundError: If the specified `dir_path` does not exist.
    """
    from jinja2 import Environment, FileSystemLoader
    from dorsal.common.config import get_collection_report_panel_config
    from dorsal.file.collection.local import LocalFileCollection
    from dorsal.file.utils.reports import REPORT_DATA_GENERATORS, resolve_template_path
    from dorsal.templates.file.icons import get_media_type_icon
    from dorsal.version import __version__
    import datetime
    import html
    import json
    import pathlib

    logger.debug(f"Generating HTML dashboard for: '{dir_path}' using template: '{template}'")
    try:
        if local_collection is None:
            collection = LocalFileCollection(
                source=dir_path,
                recursive=recursive,
                use_cache=use_cache,
            )
        else:
            collection = local_collection

        panel_config = get_collection_report_panel_config()
        enabled_panels = [name for name, is_enabled in panel_config.items() if is_enabled]

        panels_to_render = []
        for panel_id in enabled_panels:
            generator_func = REPORT_DATA_GENERATORS.get(panel_id)
            if generator_func:
                logger.debug(f"Generating data for panel: {panel_id}")
                panel_data = generator_func(collection)
                panels_to_render.append(
                    {
                        "id": panel_id,
                        "title": panel_id.replace("_", " ").title(),
                        "data": panel_data,
                    }
                )
            else:
                logger.warning(f"No data generator found for configured panel: {panel_id}")

        template_file, template_base_dir = resolve_template_path(report_type="collection", name_or_path=template)

        env = Environment(loader=FileSystemLoader(template_base_dir), autoescape=True)
        env.globals["human_filesize"] = human_filesize
        env.globals["get_media_type_icon"] = get_media_type_icon

        jinja_template = env.get_template(template_file.name)

        collection_dict = collection.to_dict()
        collection_dict["panels"] = panels_to_render

        full_collection_data_json = json.dumps(collection_dict, default=str)

        context = {
            "report_title": f"Directory Report: {html.escape(pathlib.Path(dir_path).name)}",
            "collection_source_path": dir_path,
            "generation_date": datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC"),
            "dorsal_version": __version__,
            "panels": panels_to_render,
            "full_collection_data_json": full_collection_data_json,
        }

        html_content = jinja_template.render(context)

        if output_path:
            output_file = pathlib.Path(output_path)
            output_file.parent.mkdir(parents=True, exist_ok=True)
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(html_content)
            logger.info(f"HTML dashboard saved to: {output_path}")
            return None

        return html_content
    except Exception as e:
        logger.exception(f"Failed to generate HTML dashboard for '{dir_path}'.")
        if isinstance(e, (DorsalError, FileNotFoundError)):
            raise
        raise DorsalError(f"Could not generate HTML dashboard for {dir_path}: {e}") from e

generate_html_file_report

generate_html_file_report(
    file_path,
    *,
    local_file=None,
    output_path=None,
    template="default",
    use_cache=True,
    api_key=None
)

Generates a self-contained HTML report for a single local file.

This function serves as a high-level entry point to the reporting engine. It leverages scan_file to perform a full metadata extraction and then renders the result into a rich, interactive HTML document using a flexible, user-configurable Jinja2 template system. The final output is a single, portable HTML file with all CSS and JavaScript embedded.

Example
from dorsal.api import generate_html_file_report

# Generate the report and save it to a file
generate_html_file_report(
    "path/to/my_document.pdf",
    output_path="report.html"
)

# Generate a report using a custom template and get the HTML as a string
html_content = generate_html_file_report(
    "path/to/archive.zip",
    template="compact"
)

Parameters:

file_path (str, required): The path to the local file to report on.
local_file (LocalFile, optional): An existing, pre-processed LocalFile can be passed to avoid re-scanning the file. Defaults to None.
output_path (str, optional): If provided, the HTML report will be saved to this file path. Defaults to None.
template (str, optional): The name of a built-in/user-defined template or an absolute path to a custom template .html file. Defaults to "default".
use_cache (bool, optional): Whether to use the local cache during file processing. Defaults to True.
api_key (str, optional): An API key for operations that may require it. Defaults to None.

Returns:

str | None: The generated HTML report as a string, or None if output_path was provided and the report was written to disk.

Raises:

DorsalError: If file processing or report generation fails.
TemplateNotFoundError: If the specified template cannot be located.
FileNotFoundError: If the specified file_path does not exist.

Source code in venv/lib/python3.13/site-packages/dorsal/api/file.py
def generate_html_file_report(
    file_path: str,
    *,
    local_file: LocalFile | None = None,
    output_path: str | None = None,
    template: str = "default",
    use_cache: bool = True,
    api_key: str | None = None,
) -> str | None:
    """
    Generates a self-contained HTML report for a single local file.

    This function serves as a high-level entry point to the reporting engine. It
    leverages `scan_file` to perform a full metadata extraction and then renders
    the result into a rich, interactive HTML document using a flexible,
    user-configurable Jinja2 template system. The final output is a single,
    portable HTML file with all CSS and JavaScript embedded.

    Example:
        ```python
        from dorsal.api import generate_html_file_report

        # Generate the report and save it to a file
        generate_html_file_report(
            "path/to/my_document.pdf",
            output_path="report.html"
        )

        # Generate a report using a custom template and get the HTML as a string
        html_content = generate_html_file_report(
            "path/to/archive.zip",
            template="compact"
        )
        ```

    Args:
        file_path (str): The path to the local file to report on.
        output_path (str, optional): If provided, the HTML report will be saved
            to this file path. Defaults to None.
        template (str, optional): The name of a built-in/user-defined template
            or an absolute path to a custom template .html file.
            Defaults to "default".
        use_cache (bool, optional): Whether to use the local cache during file
            processing. Defaults to True.
        api_key (str, optional): An API key for operations that may require it.
            Defaults to None.

    Returns:
        str | None: The generated HTML report as a string, or None if it was written to output_path.

    Raises:
        DorsalError: If file processing or report generation fails.
        TemplateNotFoundError: If the specified template cannot be located.
        FileNotFoundError: If the specified `file_path` does not exist.
    """
    from jinja2 import Environment, FileSystemLoader
    from dorsal.templates.file.icons import get_media_type_icon
    from dorsal.version import __version__

    logger.debug(f"Generating HTML report for: '{file_path}' using template: '{template}'")
    try:
        if local_file is None:
            local_file = scan_file(file_path, use_cache=use_cache, api_key=api_key)

        template_file, template_base_dir = resolve_template_path(report_type="file", name_or_path=template)

        env = Environment(loader=FileSystemLoader(template_base_dir), autoescape=True)
        env.globals["human_filesize"] = human_filesize
        env.globals["get_media_type_icon"] = get_media_type_icon

        jinja_template = env.get_template(template_file.name)
        file_dict = local_file.to_dict(mode="json")

        base_info = file_dict.get("annotations", {}).get("file/base", {}).get("record", {})
        file_size_info = {
            "human": human_filesize(base_info.get("size", 0)),
            "raw": f"{base_info.get('size', 0)} bytes",
        }

        local_fs_info = {
            "full_path": local_file._file_path,
            "date_created": {
                "human": local_file.date_created.strftime("%Y-%m-%d %H:%M:%S"),
                "raw": local_file.date_created.isoformat(),
            },
            "date_modified": {
                "human": local_file.date_modified.strftime("%Y-%m-%d %H:%M:%S"),
                "raw": local_file.date_modified.isoformat(),
            },
        }

        context = {
            "report_title": f"Dorsal Report: {html.escape(base_info.get('name', 'Untitled File'))}",
            "generation_date": datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC"),
            "file": file_dict,
            "file_size": file_size_info,
            "raw_data_json": json.dumps(file_dict, indent=2, default=str),
            "local_filesystem_info": local_fs_info,
            "dorsal_version": __version__,
        }

        html_content = jinja_template.render(context)

        if output_path:
            output_file = pathlib.Path(output_path)
            output_file.parent.mkdir(parents=True, exist_ok=True)
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(html_content)
            logger.info(f"HTML file report saved to: {output_path}")
            return None

        return html_content
    except Exception as e:
        logger.exception(f"Failed to generate HTML file report for '{file_path}'.")
        if isinstance(e, (DorsalError, FileNotFoundError)):
            raise
        raise DorsalError(f"Could not generate HTML report for {file_path}: {e}") from e

get_collection

get_collection(
    collection_id: str,
    hydrate: Literal[True],
    page: int = 1,
    per_page: int = 30,
    api_key: str | None = None,
    *,
    mode: Literal["pydantic"]
) -> HydratedSingleCollectionResponse
get_collection(
    collection_id: str,
    hydrate: Literal[True],
    page: int = 1,
    per_page: int = 30,
    api_key: str | None = None,
    *,
    mode: Literal["dict"]
) -> dict[str, Any]
get_collection(
    collection_id: str,
    hydrate: Literal[True],
    page: int = 1,
    per_page: int = 30,
    api_key: str | None = None,
    *,
    mode: Literal["json"]
) -> str
get_collection(
    collection_id: str,
    hydrate: Literal[False],
    page: int = 1,
    per_page: int = 30,
    api_key: str | None = None,
    *,
    mode: Literal["pydantic"]
) -> SingleCollectionResponse
get_collection(
    collection_id: str,
    hydrate: Literal[False],
    page: int = 1,
    per_page: int = 30,
    api_key: str | None = None,
    *,
    mode: Literal["dict"]
) -> dict[str, Any]
get_collection(
    collection_id: str,
    hydrate: Literal[False] = False,
    page: int = 1,
    per_page: int = 30,
    api_key: str | None = None,
    *,
    mode: Literal["json"]
) -> str
get_collection(
    collection_id,
    hydrate=False,
    page=1,
    per_page=30,
    api_key=None,
    *,
    mode="pydantic"
)

Retrieves a specific collection and its contents from DorsalHub.
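
A minimal usage sketch (the collection ID is a placeholder):

from dorsal.api import get_collection

# Fetch a collection summary as a Pydantic model (the default mode).
collection = get_collection("col_abc123")

# Fetch fully hydrated file records for the second page, as a plain dict.
details = get_collection("col_abc123", hydrate=True, page=2, mode="dict")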

Parameters:

collection_id (str, required): The unique ID of the collection to fetch.
hydrate (bool, optional): If True, returns fully detailed file records. Defaults to False.
page (int, optional): The page number for file contents. Defaults to 1.
per_page (int, optional): The number of file records per page. Defaults to 30.
api_key (str, optional): An API key for this request. Defaults to None.
mode (Literal["pydantic", "dict", "json"], optional): The desired return format. Defaults to "pydantic".

Returns:

SingleCollectionResponse | HydratedSingleCollectionResponse | dict[str, Any] | str: The collection data, formatted according to the specified mode.

Raises:

DorsalClientError: For API errors like not found or authentication issues.
DorsalError: For other unexpected library errors.

Source code in venv/lib/python3.13/site-packages/dorsal/api/collection.py
def get_collection(
    collection_id: str,
    hydrate: bool = False,
    page: int = 1,
    per_page: int = 30,
    api_key: str | None = None,
    *,
    mode: Literal["pydantic", "dict", "json"] = "pydantic",
) -> "SingleCollectionResponse | HydratedSingleCollectionResponse | dict[str, Any] | str":
    """
    Retrieves a specific collection and its contents from DorsalHub.

    Args:
        collection_id (str): The unique ID of the collection to fetch.
        hydrate (bool): If True, returns fully detailed file records. Defaults to False.
        page (int): The page number for file contents. Defaults to 1.
        per_page (int): The number of file records per page. Defaults to 30.
        api_key (str, optional): An API key for this request.
        mode (Literal["pydantic", "dict", "json"]): The desired return format.

    Returns:
        The collection data, formatted according to the specified `mode`.

    Raises:
        DorsalClientError: For API errors like not found or authentication issues.
        DorsalError: For other unexpected library errors.
    """
    from dorsal.session import get_shared_dorsal_client
    from dorsal.client import DorsalClient

    effective_client: DorsalClient
    log_message_context = "using shared client"
    if api_key:
        log_message_context = "using temporary client with provided API key"
        effective_client = DorsalClient(api_key=api_key)
    else:
        effective_client = get_shared_dorsal_client()

    logger.debug(
        "Dispatching get_collection to client (%s) for ID: '%s', hydrate=%s",
        log_message_context,
        collection_id,
        hydrate,
    )
    response: SingleCollectionResponse | HydratedSingleCollectionResponse
    try:
        if hydrate:
            response = effective_client.get_collection(
                collection_id=collection_id,
                hydrate=True,
                page=page,
                per_page=per_page,
            )
        else:
            response = effective_client.get_collection(
                collection_id=collection_id,
                hydrate=False,
                page=page,
                per_page=per_page,
            )

        if mode == "pydantic":
            return response
        if mode == "dict":
            return response.model_dump(mode="json", by_alias=True, exclude_none=True)
        if mode == "json":
            return response.model_dump_json(indent=2, by_alias=True, exclude_none=True)

        raise ValueError(f"Invalid mode: '{mode}'.")

    except DorsalError as err:
        logger.warning(
            "A client error occurred during get_dorsal_collection (%s): %s",
            log_message_context,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "An unexpected error occurred during get_dorsal_collection (%s).",
            log_message_context,
        )
        raise DorsalError(f"An unexpected error occurred while getting collection '{collection_id}': {err}") from err

get_dataset

get_dataset(dataset_id, api_key=None, client=None)

Retrieves the full definition of an existing dataset from DorsalHub.

Fetches a dataset's metadata, including its name, description, schema, and other properties.

Example
from dorsal.api import get_dataset

try:
    # Fetch a public dataset from the 'dorsal' namespace
    dataset = get_dataset("dorsal/arxiv-cs-papers")
    print(f"Dataset Name: {dataset.name}")
    print(f"Description: {dataset.description}")
except Exception as e:
    print(f"Could not retrieve dataset: {e}")

Parameters:

dataset_id (str, required): The unique identifier for the dataset, in the format "namespace/dataset-name".
api_key (str, optional): An API key to use for this request, especially for private datasets. Defaults to None.
client (DorsalClient, optional): An existing client instance to use instead of the shared client. Defaults to None.

Returns:

Dataset: A Pydantic model instance representing the full dataset definition.

Raises:

NotFoundError: If no dataset with the specified ID is found.
DorsalClientError: If the API call fails for any other reason.

Source code in venv/lib/python3.13/site-packages/dorsal/api/dataset.py
def get_dataset(dataset_id: str, api_key: str | None = None, client: DorsalClient | None = None) -> Dataset:
    """Retrieves the full definition of an existing dataset from DorsalHub.

    Fetches a dataset's metadata, including its name, description, schema,
    and other properties.

    Example:
        ```python
        from dorsal.api import get_dataset

        try:
            # Fetch a public dataset from the 'dorsal' namespace
            dataset = get_dataset("dorsal/arxiv-cs-papers")
            print(f"Dataset Name: {dataset.name}")
            print(f"Description: {dataset.description}")
        except Exception as e:
            print(f"Could not retrieve dataset: {e}")
        ```

    Args:
        dataset_id (str): The unique identifier for the dataset, in the
            format "namespace/dataset-name".
        api_key (str, optional): An API key to use for this request, especially
            for private datasets. Defaults to None.

    Returns:
        Dataset: A Pydantic model instance representing the full dataset definition.

    Raises:
        NotFoundError: If no dataset with the specified ID is found.
        DorsalClientError: If the API call fails for any other reason.
    """
    logger.debug(
        "get_dataset called with id: '%s', api_key provided: %s",
        dataset_id,
        "Yes" if api_key else "No",
    )

    if client is None:
        try:
            client = get_shared_dorsal_client(api_key=api_key)
        except Exception as err:
            logger.exception("Failed to obtain shared DorsalClient instance.")
            raise DorsalClientError("Could not initialize or retrieve the API client.") from err

    try:
        dataset_response = client.get_dataset(dataset_id=dataset_id, api_key=api_key)
        logger.debug(
            "Successfully retrieved dataset ID: '%s'",
            dataset_id,
        )
        return dataset_response
    except (
        ValueError,
        DorsalClientError,
    ) as err:
        logger.warning(
            "API call to retrieve dataset (ID: '%s') failed: %s - %s",
            dataset_id,
            type(err).__name__,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "Unexpected error during client.get_dataset (ID: '%s').",
            dataset_id,
        )
        raise DorsalClientError(
            f"An unexpected issue occurred while fetching dataset '{dataset_id}' via the client."
        ) from err

get_dataset_schema

get_dataset_schema(dataset_id, api_key=None, client=None)

Fetches the JSON schema for a given dataset.

Example
from dorsal.api import get_dataset_schema

try:
    schema = get_dataset_schema("dorsal/arxiv-cs-papers")
    print("Schema properties:")
    for prop in schema.get("properties", {}):
        print(f"- {prop}")
except Exception as e:
    print(f"Could not retrieve schema: {e}")

Parameters:

dataset_id (str, required): Identifier for a dataset (e.g., "dorsal/arxiv").
api_key (str, optional): An API key for this request. Defaults to None.
client (DorsalClient, optional): An existing client instance to use instead of the shared client. Defaults to None.

Returns:

dict[str, Any]: The JSON schema of the dataset.

Raises:

ValueError: If dataset_id is invalid (propagated from get_dataset).
DorsalClientError: Base client error or for unexpected issues (propagated from get_dataset).
AuthError: Authentication failure (propagated from get_dataset).
NotFoundError: If the dataset is not found (propagated from get_dataset).
ForbiddenError: Access to the dataset is denied (propagated from get_dataset).
RateLimitError: If the request is rate-limited by the API (propagated from get_dataset).
NetworkError: If a network issue occurs (propagated from get_dataset).
APIError: For other HTTP errors from the API (propagated from get_dataset).
ApiDataValidationError: If the API response for the dataset is malformed and cannot be parsed into a valid Dataset object (propagated from get_dataset).

Source code in venv/lib/python3.13/site-packages/dorsal/api/dataset.py
def get_dataset_schema(dataset_id: str, api_key: str | None = None, client: DorsalClient | None = None) -> dict:
    """Fetches the JSON schema for a given dataset.

    Example:
        ```python
        from dorsal.api import get_dataset_schema

        try:
            schema = get_dataset_schema("dorsal/arxiv-cs-papers")
            print("Schema properties:")
            for prop in schema.get("properties", {}):
                print(f"- {prop}")
        except Exception as e:
            print(f"Could not retrieve schema: {e}")
        ```

    Args:
        dataset_id (str): Identifier for a dataset (e.g., "dorsal/arxiv").
        api_key (str, optional): An API key for this request.

    Returns:
        dict[str, Any]: The JSON schema of the dataset.

    Raises:
        ValueError: If `dataset_id` is invalid (propagated from `get_dataset`).
        DorsalClientError: Base client error or for unexpected issues (propagated
                           from `get_dataset`).
        AuthError: Authentication failure (propagated from `get_dataset`).
        NotFoundError: If the dataset is not found (propagated from `get_dataset`).
        ForbiddenError: Access to the dataset is denied (propagated from `get_dataset`).
        RateLimitError: If the request is rate-limited by the API (propagated from
                        `get_dataset`).
        NetworkError: If a network issue occurs (propagated from `get_dataset`).
        APIError: For other HTTP errors from the API (propagated from `get_dataset`).
        ApiDataValidationError: If the API response for the dataset is malformed and
                                cannot be parsed into a valid `Dataset` object
                                (propagated from `get_dataset`).
    """
    logger.debug(
        "Attempting to fetch schema for dataset_id: '%s'. API key used: %s",
        dataset_id,
        "Yes (user-provided)" if api_key else "No (client default)",
    )
    if client is None:
        try:
            client = get_shared_dorsal_client()
        except Exception as err:
            logger.exception("Failed to obtain shared DorsalClient instance.")
            raise DorsalClientError("Could not initialize or retrieve the API client.") from err

    try:
        schema = client.get_dataset_schema(dataset_id=dataset_id)
        logger.debug("Successfully retrieved schema via client for dataset_id: '%s'", dataset_id)
        return schema
    except (ValueError, DorsalClientError) as err:
        logger.warning(
            "Client call to retrieve schema for dataset_id '%s' failed: %s - %s",
            dataset_id,
            type(err).__name__,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "Unexpected error during client.get_dataset_schema call for dataset_id: '%s'.",
            dataset_id,
        )
        raise DorsalClientError(
            message=f"An unexpected issue occurred while fetching schema for dataset '{dataset_id}' via the client."
        ) from err

get_directory_info

get_directory_info(
    dir_path,
    recursive=False,
    media_type=True,
    progress_console=None,
    palette=None,
)

Calculates and returns a detailed summary of a directory (file and subdirectory counts and, optionally, per-media-type statistics) using a single-pass walk of the tree.
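
A minimal usage sketch (the path is a placeholder; the exact fields of the result object are not documented here, so it is printed as-is):

from dorsal.api import get_directory_info

# Summarize a directory tree, including media-type statistics.
info = get_directory_info("path/to/data", recursive=True)
print(info)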

Source code in venv/lib/python3.13/site-packages/dorsal/api/file.py
def get_directory_info(
    dir_path: str,
    recursive: bool = False,
    media_type: bool = True,
    progress_console: "Console | None" = None,
    palette: dict[str, str] | None = None,
) -> _DirectoryInfoResult:
    """
    Calculates and returns a detailed summary of a directory using a single-pass method.
    """
    if not os.path.isdir(dir_path):
        raise NotADirectoryError(f"The specified path is not a directory: {dir_path}")

    metrics = _DirectoryMetrics(media_type_enabled=media_type)
    start_time = time.perf_counter()

    tqdm_bar = None
    rich_progress = None
    task_id = None

    if is_jupyter_environment():
        from tqdm import tqdm

        tqdm_bar = tqdm(desc="Analyzing directory", unit="file")
    elif progress_console:
        rich_progress = _create_rich_progress(progress_console, palette)
        task_id = rich_progress.add_task("Analyzing directory...", total=None)

    progress_manager = rich_progress if rich_progress else open(os.devnull, "w")
    with progress_manager:
        for root, dirs, files in os.walk(dir_path, topdown=True):
            metrics.total_dirs += len(dirs)

            for name in files:
                metrics.total_files += 1

                if rich_progress and task_id is not None:
                    rich_progress.update(task_id, advance=1)
                elif tqdm_bar:
                    tqdm_bar.update(1)

                metrics.process_file(pathlib.Path(root) / name)

            if not recursive:
                dirs.clear()

    if tqdm_bar:
        tqdm_bar.close()

    duration = time.perf_counter() - start_time
    logger.info("Directory analysis for '%s' completed in %.2f seconds.", dir_path, duration)

    return _format_results(metrics, duration)

get_dorsal_file_record

get_dorsal_file_record(
    hash_string: str,
    mode: Literal["pydantic"],
    private: bool | None = None,
    api_key: str | None = None,
) -> "FileRecord"
get_dorsal_file_record(
    hash_string: str,
    mode: Literal["dict"],
    private: bool | None = None,
    api_key: str | None = None,
) -> dict[str, Any]
get_dorsal_file_record(
    hash_string: str,
    mode: Literal["json"],
    private: bool | None = None,
    api_key: str | None = None,
) -> str
get_dorsal_file_record(
    hash_string, mode="pydantic", private=None, api_key=None
)

Gets metadata for a file record from DorsalHub.

  • private=None (Default): check for a private record first, and if not found, check for a public one.
  • private=True: get the private record only
  • private=False: get the public record only
Example
from dorsal.api import get_dorsal_file_record

# Agnostic search (recommended)
agnostic_file = get_dorsal_file_record("some_hash")

# Public-only search
public_file = get_dorsal_file_record("some_hash", private=False)

# Private-only search as a dictionary
private_file_dict = get_dorsal_file_record("some_hash", private=True, mode="dict")

Parameters:

hash_string (str, required): The hash of the file to fetch (e.g., "sha256:...").
mode (Literal["pydantic", "dict", "json"], optional): The desired return format. Defaults to "pydantic", returning a FileRecord object.
private (bool | None, optional): Controls the search visibility. Defaults to None (agnostic search).
api_key (str, optional): An API key to use for this request, overriding any globally configured key. Defaults to None.

Source code in venv/lib/python3.13/site-packages/dorsal/api/file.py
def get_dorsal_file_record(
    hash_string: str,
    mode: Literal["pydantic", "dict", "json"] = "pydantic",
    private: bool | None = None,
    api_key: str | None = None,
) -> "FileRecord | dict[str, Any] | str":
    """
    Gets metadata for a file record from DorsalHub

    - `private=None` (Default): check for a public record first, and if not found, check for a private one.
    - `private=True`: get public record
    - `private=False`: get private record

    Example:
        ```python
        from dorsal.api import get_file_metadata

        # Agnostic search (recommended)
        agnostic_file = get_file_metadata("some_hash")

        # Public-only search
        public_file = get_file_metadata("some_hash", private=False)

        # Private-only search as a dictionary
        private_file_dict = get_file_metadata("some_hash", private=True, mode="dict")
        ```

    Args:
        hash_string (str): The hash of the file to fetch (e.g., "sha256:...").
        mode (Literal["pydantic", "dict", "json"], optional): The desired return
            format. Defaults to "pydantic", returning a `FileRecord` object.
        private (Optional[bool], optional): Controls the search visibility.
            Defaults to None (agnostic search).
        api_key (str, optional): An API key to use for this request, overriding
            any globally configured key. Defaults to None.

    """
    from dorsal.session import get_shared_dorsal_client

    search_strategy = (
        "Agnostic (Private, then Public)" if private is None else ("Private-only" if private else "Public-only")
    )
    log_message_context = "using default client"

    effective_client = get_shared_dorsal_client()
    if api_key:
        from dorsal.client import DorsalClient

        log_message_context = "using temporary client with provided API key"
        effective_client = DorsalClient(api_key=api_key)

    logger.debug(
        "Getting file metadata for hash: '%s'. Mode: %s, Search: %s, %s",
        hash_string,
        mode,
        search_strategy,
        log_message_context,
    )

    try:
        cleaned_hash_string = hash_string.strip() if isinstance(hash_string, str) else ""
        if not cleaned_hash_string:
            raise ValueError("hash_string must be a non-empty string.")

        file_record = effective_client.download_file_record(
            hash_string=cleaned_hash_string,
            private=private,
        )

        if mode == "pydantic":
            return file_record
        if mode == "dict":
            return file_record.model_dump(mode="json", by_alias=True, exclude_none=True)
        if mode == "json":
            return file_record.model_dump_json(indent=2, by_alias=True, exclude_none=True)

        raise ValueError(f"Invalid mode: '{mode}'.")

    except (TypeError, ValueError) as err:
        logger.warning(
            "Input validation error in get_dorsal_file_record (hash: '%s', search: %s, %s): %s",
            hash_string,
            search_strategy,
            log_message_context,
            err,
        )
        raise
    except DorsalClientError as err:
        if isinstance(err.original_exception, NotFoundError):
            err.message = f"File not found in '{search_strategy}' scope for hash '{cleaned_hash_string}'."

        logger.warning(
            "DorsalClientError during get_dorsal_file_record (hash: '%s', search: %s, %s): %s",
            hash_string,
            search_strategy,
            log_message_context,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "Unexpected error in get_dorsal_file_record for hash '%s' (%s).",
            hash_string,
            log_message_context,
        )
        if isinstance(err, DorsalError):
            raise
        raise DorsalError(f"An unexpected error occurred while getting metadata for hash '{hash_string}'.") from err

identify_file

identify_file(
    file_path: str,
    quick: bool = True,
    file_size: int | None = None,
    *,
    mode: Literal["pydantic"],
    api_key: str | None = None,
    use_cache: bool = True
) -> "FileRecord"
identify_file(
    file_path: str,
    quick: bool = True,
    file_size: int | None = None,
    *,
    mode: Literal["dict"],
    api_key: str | None = None,
    use_cache: bool = True
) -> dict[str, Any]
identify_file(
    file_path: str,
    quick: bool = True,
    file_size: int | None = None,
    *,
    mode: Literal["json"],
    api_key: str | None = None,
    use_cache: bool = True
) -> str
identify_file(
    file_path,
    quick=True,
    file_size=None,
    *,
    mode="pydantic",
    api_key=None,
    use_cache=True
)

Gets metadata for a local file from DorsalHub using its content hash.

This function identifies a file by first calculating its hash locally and then querying the remote API for a matching record. It provides an efficient "quick hash" option for large files and falls back to a secure SHA-256 hash. A local cache is used to avoid re-calculating hashes on subsequent calls for the same file path.

Example
from dorsal.api import identify_file

try:
    # Identify a file, using the fast "quick hash" if possible
    record = identify_file("path/to/my_video.mp4", mode="dict")
    print(f"Successfully identified '{record['name']}'")
    print(f"Dorsal URL: {record['url']}")

except FileNotFoundError:
    print("Error: The file could not be found at that path.")
except DorsalClientError as e:
    # Catches errors like record not found on the server
    print(f"API Error: {e.message}")

Parameters:

Name Type Description Default
file_path str

The path to the local file to identify.

required
quick bool

If True, attempts to use the faster "quick hash" for files >= 32MiB. Defaults to True.

True
file_size int

An optional pre-calculated file size in bytes. If not provided, it will be calculated. Defaults to None.

None
mode Literal['pydantic', 'dict', 'json']

The desired return format. Defaults to "pydantic".

'pydantic'
api_key str

An API key for this request, overriding the client's default. Defaults to None.

None
use_cache bool

If True, uses the local hash cache to avoid re-hashing unchanged files. Defaults to True.

True

Returns:

Type Description
FileRecord | dict[str, Any] | str

Union[FileRecord, dict, str]: The file record from DorsalHub, formatted according to the specified mode.

Raises:

Type Description
FileNotFoundError

If the file does not exist at the specified path.

DorsalClientError

For API-level errors, such as NotFoundError if no record matches the hash.

DorsalError

For other unexpected library errors.

Source code in venv/lib/python3.13/site-packages/dorsal/api/file.py
def identify_file(
    file_path: str,
    quick: bool = True,
    file_size: int | None = None,
    *,
    mode: Literal["pydantic", "dict", "json"] = "pydantic",
    api_key: str | None = None,
    use_cache: bool = True,
) -> FileRecord | dict[str, Any] | str:
    """Gets metadata for a local file from DorsalHub using its content hash.

    This function identifies a file by first calculating its hash locally and then
    querying the remote API for a matching record. It provides an efficient
    "quick hash" option for large files and falls back to a secure SHA-256 hash.
    A local cache avoids re-calculating hashes on subsequent calls for the same file path.

    Example:
        ```python
        from dorsal.api import identify_file

        try:
            # Identify a file, using the fast "quick hash" if possible
            record = identify_file("path/to/my_video.mp4", mode="dict")
            print(f"Successfully identified '{record['name']}'")
            print(f"Dorsal URL: {record['url']}")

        except FileNotFoundError:
            print("Error: The file could not be found at that path.")
        except DorsalClientError as e:
            # Catches errors like record not found on the server
            print(f"API Error: {e.message}")
        ```

    Args:
        file_path (str): The path to the local file to identify.
        quick (bool, optional): If True, attempts to use the faster "quick hash"
            for files >= 32MiB. Defaults to True.
        file_size (int, optional): An optional pre-calculated file size in bytes.
            If not provided, it will be calculated. Defaults to None.
        mode (Literal["pydantic", "dict", "json"], optional): The desired return
            format. Defaults to "pydantic".
        api_key (str, optional): An API key for this request, overriding the
            client's default. Defaults to None.
        use_cache (bool, optional): If True, uses the local hash cache to avoid
            re-hashing unchanged files. Defaults to True.

    Returns:
        Union[FileRecord, dict, str]: The file record from DorsalHub, formatted
            according to the specified `mode`.

    Raises:
        FileNotFoundError: If the file does not exist at the specified path.
        DorsalClientError: For API-level errors, such as `NotFoundError` if
            no record matches the hash.
        DorsalError: For other unexpected library errors.
    """
    from dorsal.session import get_shared_dorsal_client
    from dorsal.client import DorsalClient

    log_message_context = "using default client"
    effective_client: DorsalClient = get_shared_dorsal_client()
    if api_key:
        log_message_context = "using temporary client with provided API key"
        effective_client = DorsalClient(api_key=api_key)

    logger.debug(
        "Identifying file record for path: '%s' (quick=%s, %s)",
        file_path,
        quick,
        log_message_context,
    )

    try:
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found at path: {file_path}")

        file_record = None
        secure_hash_key = ""
        cache = get_shared_cache() if use_cache else None

        if quick:
            if file_size is None:
                file_size = get_filesize(file_path=file_path)

            if file_size >= QuickHasher.min_permitted_filesize:
                quick_hash_val = None
                if cache:
                    quick_hash_val = get_cached_hash(
                        file_path=file_path,
                        cache=cache,
                        hash_callable=lambda p: get_quick_hash(p, fallback_to_sha256=False, file_size=file_size),
                        hash_function="QUICK",
                    )
                else:
                    quick_hash_val = get_quick_hash(
                        file_path=file_path,
                        fallback_to_sha256=False,
                        file_size=file_size,
                    )

                if quick_hash_val:
                    quick_hash_key = f"QUICK:{quick_hash_val}"
                    logger.debug(
                        "Attempting to identify file with Quick Hash: %s",
                        quick_hash_key,
                    )
                    try:
                        file_record = effective_client.download_file_record(hash_string=quick_hash_key)
                    except ConflictError:
                        logger.warning(
                            "Quick Hash collision for '%s'. Falling back to SHA-256.",
                            quick_hash_key,
                        )
                    except NotFoundError:
                        logger.debug("Quick Hash record not found. Falling back to SHA-256.")
            else:
                logger.debug("File size is less than 32MiB. Skipping Quick Hash.")

        if file_record is None:
            secure_hash_val = None
            if cache:
                secure_hash_val = get_cached_hash(
                    file_path=file_path,
                    cache=cache,
                    hash_callable=get_sha256_hash,
                    hash_function="SHA-256",
                )
            else:
                secure_hash_val = get_sha256_hash(file_path=file_path)

            if not secure_hash_val:
                raise DorsalError(f"Could not generate SHA-256 hash for file: {file_path}")

            secure_hash_key = f"SHA-256:{secure_hash_val}"
            logger.debug("Attempting to identify file with Secure Hash: %s", secure_hash_key)
            file_record = effective_client.download_file_record(hash_string=secure_hash_key)

        if mode == "dict":
            return file_record.model_dump(mode="json", by_alias=True, exclude_none=True)
        if mode == "json":
            return file_record.model_dump_json(indent=2, by_alias=True, exclude_none=True)
        if mode != "pydantic":
            logger.debug("Invalid mode '%s' specified. Returning default model.", mode)  # type: ignore[unreachable]
        return file_record

    except DorsalClientError as err:
        if isinstance(err.original_exception, NotFoundError):
            hash_key = secure_hash_key or "the file's hash"
            err.message = f"No file record was found on DorsalHub matching {hash_key}."
        logger.debug("A client error occurred during identify_file for '%s': %s", file_path, err)
        raise
    except (FileNotFoundError, ValueError) as err:
        logger.error(
            "An input or file system error occurred during identify_file for '%s': %s",
            file_path,
            err,
        )
        raise
    except Exception as err:
        logger.exception("An unexpected error occurred during identify_file for '%s'.", file_path)
        if isinstance(err, DorsalError):
            raise
        raise DorsalError(f"An unexpected error occurred while identifying file '{file_path}'.") from err

index_directory

index_directory(
    dir_path,
    recursive=False,
    *,
    private=True,
    api_key=None,
    use_cache=True
)

Scans a directory and indexes all files to DorsalHub.

This is a powerful, one-shot function that performs a complete workflow:

1. Scans the specified directory for files.
2. Generates rich metadata for each file locally.
3. Uploads all generated metadata records to DorsalHub in managed batches.

Example
from dorsal.api import index_directory

# Scan a directory and index all files to your private records
summary = index_directory("path/to/project_assets", recursive=True, private=True)

print("--- Indexing Complete ---")
print(f"Files processed locally: {summary['total_records_processed_locally']}")
print(f"Successfully indexed to API: {summary['total_records_accepted_by_api']}")

Parameters:

Name Type Description Default
dir_path str

The path to the directory you want to scan and index.

required
recursive bool

If True, scans all subdirectories recursively. Defaults to False.

False
private bool

If True, all file records will be created as private on DorsalHub. Defaults to True.

True
api_key str | None

An API key to use for this operation, overriding the client's default. Defaults to None.

None
use_cache bool

If True, uses the local hash cache when generating file records. Defaults to True.

True

Returns:

Name Type Description
dict dict

A summary dictionary detailing the results of the entire batch operation, including local processing and API indexing counts.
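
A short sketch of inspecting the summary; the keys match those built by the source below, and batch_processing_details holds one entry per API batch:

```python
from dorsal.api import index_directory

summary = index_directory("path/to/project_assets", recursive=True)

# Report any batches the API rejected, using the per-batch bookkeeping fields.
if summary["failed_api_batches"]:
    for detail in summary["batch_processing_details"]:
        if detail["status"] == "failure":
            print(f"Batch {detail['batch_number']} failed: {detail['error_type']}: {detail['error_message']}")
```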

Source code in venv/lib/python3.13/site-packages/dorsal/api/file.py
def index_directory(
    dir_path: str,
    recursive: bool = False,
    *,
    private: bool = True,
    api_key: str | None = None,
    use_cache: bool = True,
) -> dict:
    """Scans a directory and indexes all files to DorsalHub.

    This is a powerful, one-shot function that performs a complete workflow:
    1. Scans the specified directory for files.
    2. Generates rich metadata for each file locally.
    3. Uploads all generated metadata records to DorsalHub in managed batches.

    Example:
        ```python
        from dorsal.api import index_directory

        # Scan a directory and index all files to your private records
        summary = index_directory("path/to/project_assets", recursive=True, private=True)

        print("--- Indexing Complete ---")
        print(f"Files processed locally: {summary['total_records_processed_locally']}")
        print(f"Successfully indexed to API: {summary['total_records_accepted_by_api']}")
        ```

    Args:
        dir_path (str): The path to the directory you want to scan and index.
        recursive (bool, optional): If True, scans all subdirectories
            recursively. Defaults to False.
        private (bool, optional): If True, all file records will be created
            as private on DorsalHub. Defaults to True.
        api_key (str | None, optional): An API key to use for this operation,
            overriding the client's default. Defaults to None.
        use_cache (bool, optional): If True, uses the local hash cache when
            generating file records. Defaults to True.

    Returns:
        dict: A summary dictionary detailing the results of the entire batch
            operation, including local processing and API indexing counts.
    """
    from dorsal.file.metadata_reader import MetadataReader

    effective_reader: MetadataReader
    log_message_context = ""

    if api_key is not None:
        log_message_context = "using provided API key with temporary MetadataReader"
        logger.debug(
            "API key override for index_directory (dir: '%s'). Creating temporary MetadataReader.",
            dir_path,
        )
        effective_reader = MetadataReader(api_key=api_key)
    else:
        log_message_context = "using shared METADATA_READER instance"
        logger.debug(
            "No API key override for index_directory (dir: '%s'). Using shared METADATA_READER.",
            dir_path,
        )
        effective_reader = get_metadata_reader()

    logger.debug(
        "High-level index_directory: dir_path='%s' (%s), recursive=%s, private=%s.",
        dir_path,
        log_message_context,
        recursive,
        private,
    )

    all_records_to_index: list[FileRecordStrict]
    file_hash_to_path_map: dict[str, str]

    try:
        logger.debug(
            "Step 1: Generating file records from directory '%s' via MetadataReader.",
            dir_path,
        )
        all_records_to_index, file_hash_to_path_map = effective_reader.generate_processed_records_from_directory(
            dir_path=dir_path, recursive=recursive, skip_cache=not use_cache
        )
        total_records_processed_locally = len(all_records_to_index)
        logger.debug(
            "MetadataReader generated %d unique file records from directory '%s'.",
            total_records_processed_locally,
            dir_path,
        )
    except DorsalError as err:
        logger.warning(
            "Failed to generate file records from directory '%s' (%s): %s - %s",
            dir_path,
            log_message_context,
            type(err).__name__,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "Unexpected error generating file records from directory '%s' (%s).",
            dir_path,
            log_message_context,
        )
        if isinstance(err, DorsalError):
            raise
        raise DorsalError(f"Unexpected error processing directory '{dir_path}': {err}") from err

    if not all_records_to_index:
        logger.debug("No unique file records generated from directory '%s' to index.", dir_path)
        return {
            "total_records_processed_locally": 0,
            "total_batches_created": 0,
            "successful_api_batches": 0,
            "failed_api_batches": 0,
            "total_records_accepted_by_api": 0,
            "batch_processing_details": [],
        }

    batches = [
        all_records_to_index[i : i + constants.API_MAX_BATCH_SIZE]
        for i in range(0, total_records_processed_locally, constants.API_MAX_BATCH_SIZE)
    ]
    total_batches_created = len(batches)

    logger.debug(
        "Submitting %d records in %d batches of up to %d each to DorsalHub (private=%s, context: %s).",
        total_records_processed_locally,
        total_batches_created,
        constants.API_MAX_BATCH_SIZE,
        private,
        log_message_context,
    )

    successful_api_batches_count = 0
    failed_api_batches_count = 0
    total_records_accepted_by_api = 0
    batch_processing_details_list = []

    for i, current_batch_records in enumerate(batches):
        batch_number = i + 1
        records_in_this_batch = len(current_batch_records)
        batch_detail_entry: dict = {
            "batch_number": batch_number,
            "records_in_batch": records_in_this_batch,
            "status": "failure",
            "api_response": None,
            "error_message": None,
            "error_type": None,
        }

        logger.debug(
            "Submitting API batch %d of %d (%d records) for directory '%s'.",
            batch_number,
            total_batches_created,
            records_in_this_batch,
            dir_path,
        )
        try:
            batch_api_response: FileIndexResponse
            if private:
                batch_api_response = effective_reader._client.index_private_file_records(
                    file_records=current_batch_records
                )
            else:
                batch_api_response = effective_reader._client.index_public_file_records(
                    file_records=current_batch_records
                )

            if file_hash_to_path_map and batch_api_response.results:
                for result_item in batch_api_response.results:
                    if hasattr(result_item, "hash") and hasattr(result_item, "file_path"):
                        path = file_hash_to_path_map.get(result_item.hash)
                        if path:
                            result_item.file_path = path

            batch_detail_entry["status"] = "success"
            batch_detail_entry["api_response"] = batch_api_response
            successful_api_batches_count += 1
            total_records_accepted_by_api += batch_api_response.success

            logger.debug(
                "API Batch %d of %d for directory '%s' submitted successfully. API Response: Total=%d, Success=%d.",
                batch_number,
                total_batches_created,
                dir_path,
                batch_api_response.total,
                batch_api_response.success,
            )
        except DorsalError as err:
            logger.warning(
                "API Batch %d of %d for directory '%s' failed: %s - %s",
                batch_number,
                total_batches_created,
                dir_path,
                type(err).__name__,
                err,
            )
            batch_detail_entry["error_message"] = str(err)
            batch_detail_entry["error_type"] = type(err).__name__
            failed_api_batches_count += 1
        except Exception as err:
            logger.exception(
                "Unexpected error submitting API batch %d of %d for directory '%s'.",
                batch_number,
                total_batches_created,
                dir_path,
            )
            batch_detail_entry["error_message"] = f"Unexpected error: {str(err)}"
            batch_detail_entry["error_type"] = type(err).__name__
            failed_api_batches_count += 1

        batch_processing_details_list.append(batch_detail_entry)

    overall_summary = {
        "total_records_processed_locally": total_records_processed_locally,
        "total_batches_created": total_batches_created,
        "successful_api_batches": successful_api_batches_count,
        "failed_api_batches": failed_api_batches_count,
        "total_records_accepted_by_api": total_records_accepted_by_api,
        "batch_processing_details": batch_processing_details_list,
    }

    if failed_api_batches_count > 0:
        logger.warning(
            "Batch indexing for directory '%s' (%s) completed with %d successful and %d failed API batches (out of %d). "
            "Total records accepted by API in successful batches: %d.",
            dir_path,
            log_message_context,
            successful_api_batches_count,
            failed_api_batches_count,
            total_batches_created,
            total_records_accepted_by_api,
        )
    else:
        logger.debug(
            "Batch indexing for directory '%s' (%s) completed successfully. All %d API batches processed. "
            "Total records accepted by API: %d.",
            dir_path,
            log_message_context,
            total_batches_created,
            total_records_accepted_by_api,
        )
    return overall_summary

index_file

index_file(
    file_path, *, private=True, api_key=None, use_cache=True
)

Processes a single local file and uploads its metadata to DorsalHub.

This function provides a simple, one-shot way to get a local file's metadata indexed on the remote server.

Example
from dorsal.api import index_file

try:
    response = index_file("path/to/my_image.jpg", private=True)
    if response.success > 0:
        print("File indexed successfully!")
        print(f"View at: {response.results[0].url}")
except Exception as e:
    print(f"Failed to index file: {e}")

Parameters:

Name Type Description Default
file_path str

The path to the local file to process and index.

required
private bool

If True, the record will be created as private. Defaults to True.

True
api_key str

An API key to use for this specific request. Defaults to None.

None
use_cache bool

If True, uses the local hash cache when processing the file. Defaults to True.

True

Returns:

Name Type Description
FileIndexResponse FileIndexResponse

A response object from the API detailing the result of the indexing operation.

Source code in venv/lib/python3.13/site-packages/dorsal/api/file.py
def index_file(
    file_path: str,
    *,
    private: bool = True,
    api_key: str | None = None,
    use_cache: bool = True,
) -> FileIndexResponse:
    """Processes a single local file and uploads its metadata to DorsalHub.

    This function provides a simple, one-shot way to get a local file's
    metadata indexed on the remote server.

    Example:
        ```python
        from dorsal.api import index_file

        try:
            response = index_file("path/to/my_image.jpg", private=True)
            if response.success > 0:
                print("File indexed successfully!")
                print(f"View at: {response.results[0].url}")
        except Exception as e:
            print(f"Failed to index file: {e}")
        ```

    Args:
        file_path (str): The path to the local file to process and index.
        private (bool, optional): If True, the record will be created as private.
            Defaults to True.
        api_key (str, optional): An API key to use for this specific request.
            Defaults to None.
        use_cache (bool, optional): If True, uses the local hash cache when
            processing the file. Defaults to True.

    Returns:
        FileIndexResponse: A response object from the API detailing the
            result of the indexing operation.
    """
    from dorsal.file.metadata_reader import MetadataReader

    metadata_reader: MetadataReader
    log_message_context = ""

    if api_key is not None:
        log_message_context = (
            "using provided API key with temporary MetadataReader (default model/ignore_duplicates settings)"
        )
        logger.debug(
            "API key override provided for index_file (file: '%s'). Creating temporary MetadataReader instance.",
            file_path,
        )
        metadata_reader = MetadataReader(api_key=api_key)

    else:
        log_message_context = "using shared METADATA_READER instance"
        logger.debug(
            "No API key override for index_file (file: '%s'). Using shared METADATA_READER instance.",
            file_path,
        )
        metadata_reader = get_metadata_reader()

    logger.debug(
        "High-level index_file calling effective MetadataReader for file_path='%s' (%s), private=%s.",
        file_path,
        log_message_context,
        private,
    )

    try:
        response = metadata_reader.index_file(file_path=file_path, private=private, skip_cache=not use_cache)
        logger.debug(
            "Effective MetadataReader.index_file completed for file_path='%s'. Response success: %s",
            file_path,
            response.success if hasattr(response, "success") else "N/A",
        )
        return response
    except (FileNotFoundError, IOError, DorsalError) as err:
        logger.warning(
            "Call to effective MetadataReader.index_file for file_path='%s' (%s) failed: %s - %s",
            file_path,
            log_message_context,
            type(err).__name__,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "Unexpected error in high-level index_file for file_path='%s' (%s).",
            file_path,
            log_message_context,
        )
        if isinstance(err, DorsalError):
            raise
        raise DorsalError(
            f"An unexpected error occurred while indexing file '{file_path}' ({log_message_context})."
        ) from err

list_collections

list_collections(
    page: int = 1,
    per_page: int = 25,
    api_key: str | None = None,
    *,
    mode: Literal["pydantic"]
) -> CollectionsResponse
list_collections(
    page: int = 1,
    per_page: int = 25,
    api_key: str | None = None,
    *,
    mode: Literal["dict"]
) -> dict[str, Any]
list_collections(
    page: int = 1,
    per_page: int = 25,
    api_key: str | None = None,
    *,
    mode: Literal["json"]
) -> str
list_collections(
    page=1, per_page=25, api_key=None, *, mode="pydantic"
)

Retrieves a paginated list of collections from DorsalHub.

This is a high-level wrapper around the DorsalClient's list_collections method, providing a simple way to access collection information.

Example
from dorsal.api import list_collections

# Get the first page of collections as Pydantic objects
response = list_collections()
print(f"Found {response.pagination.record_count} total collections.")
for collection in response.results:
    print(f"- {collection.name} (ID: {collection.id})")

# Get the second page as a JSON string
response_json = list_collections(page=2, mode="json")
print(response_json)
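
To fetch every collection, a simple page sweep works. This sketch assumes the paginated response exposes pagination.page_count (referenced in the source's debug logging below) and a results list as in the example above:

```python
from dorsal.api import list_collections

all_collections = []
page = 1
while True:
    resp = list_collections(page=page)
    all_collections.extend(resp.results)
    if page >= resp.pagination.page_count:
        break
    page += 1
print(f"Fetched {len(all_collections)} collections.")
```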

Parameters:

Name Type Description Default
page int

The page number for pagination. Defaults to 1.

1
per_page int

The number of collections per page. Defaults to 25.

25
api_key str

An API key for this request, overriding the client's default. Defaults to None.

None
mode Literal['pydantic', 'dict', 'json']

The desired return format. Defaults to "pydantic".

'pydantic'

Returns:

Type Description
CollectionsResponse | dict[str, Any] | str

Union[CollectionsResponse, dict, str]: The search results, formatted according to the specified mode.

Raises:

Type Description
DorsalClientError

For client-side validation errors or API errors like authentication or rate limiting.

DorsalError

For other unexpected library errors.

Source code in venv/lib/python3.13/site-packages/dorsal/api/collection.py
def list_collections(
    page: int = 1,
    per_page: int = 25,
    api_key: str | None = None,
    *,
    mode: Literal["pydantic", "dict", "json"] = "pydantic",
) -> "CollectionsResponse | dict[str, Any] | str":
    """
    Retrieves a paginated list of collections from DorsalHub.

    This is a high-level wrapper around the DorsalClient's list_collections
    method, providing a simple way to access collection information.

    Example:
        ```python
        from dorsal.api import list_collections

        # Get the first page of collections as Pydantic objects
        response = list_collections()
        print(f"Found {response.pagination.record_count} total collections.")
        for collection in response.results:
            print(f"- {collection.name} (ID: {collection.id})")

        # Get the second page as a JSON string
        response_json = list_collections(page=2, mode="json")
        print(response_json)
        ```

    Args:
        page (int): The page number for pagination. Defaults to 1.
        per_page (int): The number of collections per page. Defaults to 25.
        api_key (str, optional): An API key for this request, overriding the
            client's default. Defaults to None.
        mode (Literal["pydantic", "dict", "json"]): The desired return format.
            Defaults to "pydantic".

    Returns:
        Union[CollectionsResponse, dict, str]: The search results, formatted
            according to the specified `mode`.

    Raises:
        DorsalClientError: For client-side validation errors or API errors
            like authentication or rate limiting.
        DorsalError: For other unexpected library errors.
    """
    from dorsal.session import get_shared_dorsal_client
    from dorsal.client import DorsalClient

    effective_client: DorsalClient
    log_message_context = ""
    if api_key:
        log_message_context = "using temporary client with provided API key"
        logger.debug("API key override provided for list_collections. Creating temporary DorsalClient.")
        effective_client = DorsalClient(api_key=api_key)
    else:
        log_message_context = "using shared client"
        logger.debug("No API key override for list_collections. Using shared client instance.")
        effective_client = get_shared_dorsal_client()

    logger.debug(
        "Dispatching list_collections to client (%s) with params: page=%d, per_page=%d",
        log_message_context,
        page,
        per_page,
    )

    try:
        response = effective_client.list_collections(
            page=page,
            per_page=per_page,
        )

        logger.debug(
            "Collection list successful. Returned page %d of %d, with %d records.",
            response.pagination.current_page,
            response.pagination.page_count,
            len(response.records),
        )

        if mode == "pydantic":
            return response
        if mode == "dict":
            return response.model_dump(mode="json", by_alias=True, exclude_none=True)
        if mode == "json":
            return response.model_dump_json(indent=2, by_alias=True, exclude_none=True)

        raise ValueError(f"Invalid mode: '{mode}'. Must be one of 'pydantic', 'dict', or 'json'.")

    except DorsalError as err:
        logger.warning(
            "A client error occurred during list_dorsal_collections (%s): %s - %s",
            log_message_context,
            type(err).__name__,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "An unexpected error occurred during list_dorsal_collections (%s).",
            log_message_context,
        )
        raise DorsalError(f"An unexpected error occurred while listing collections: {err}") from err

make_collection_private

make_collection_private(collection_id, api_key=None)

Makes a public collection private.

This is a high-level wrapper that handles client instantiation and error logging.
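
A minimal sketch (the collection ID is a placeholder; the response is the CollectionWebLocationResponse described below):

```python
from dorsal.api import make_collection_private

response = make_collection_private(collection_id="col_123")  # placeholder ID
print(response)  # carries the collection's new private web URL
```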

Parameters:

Name Type Description Default
collection_id str

The unique ID of the collection to make private.

required
api_key str

An API key for this request.

None

Returns:

Name Type Description
CollectionWebLocationResponse CollectionWebLocationResponse

An object containing the new private web URL.

Raises:

Type Description
DorsalClientError

For API errors (e.g., ConflictError if already private).

DorsalError

For other unexpected library errors.

Source code in venv/lib/python3.13/site-packages/dorsal/api/collection.py
def make_collection_private(
    collection_id: str,
    api_key: str | None = None,
) -> "CollectionWebLocationResponse":
    """
    Makes a public collection private.

    This is a high-level wrapper that handles client instantiation and error logging.

    Args:
        collection_id (str): The unique ID of the collection to make private.
        api_key (str, optional): An API key for this request.

    Returns:
        CollectionWebLocationResponse: An object containing the new private web URL.

    Raises:
        DorsalClientError: For API errors (e.g., ConflictError if already private).
        DorsalError: For other unexpected library errors.
    """
    from dorsal.session import get_shared_dorsal_client
    from dorsal.client import DorsalClient

    effective_client: DorsalClient
    log_message_context = "using shared client"
    if api_key:
        log_message_context = "using temporary client with provided API key"
        effective_client = DorsalClient(api_key=api_key)
    else:
        effective_client = get_shared_dorsal_client()

    logger.debug(
        "Dispatching make_collection_private to client (%s) for ID: '%s'",
        log_message_context,
        collection_id,
    )

    try:
        response = effective_client.make_collection_private(collection_id=collection_id)
        logger.info("Successfully made collection '%s' private.", collection_id)
        return response

    except DorsalError as err:
        logger.warning(
            "A client error occurred during make_collection_private (%s): %s",
            log_message_context,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "An unexpected error occurred during make_collection_private (%s).",
            log_message_context,
        )
        raise DorsalError(
            f"An unexpected error occurred while making collection '{collection_id}' private: {err}"
        ) from err

make_collection_public

make_collection_public(collection_id, api_key=None)

Makes a private collection public.

This is a high-level wrapper that handles client instantiation and error logging.
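
A minimal sketch, mirroring make_collection_private above (placeholder ID; per the Raises section, a ConflictError surfaces as a DorsalClientError if the collection is already public):

```python
from dorsal.api import make_collection_public

response = make_collection_public(collection_id="col_123")  # placeholder ID
print(response)  # carries the collection's new public web URL
```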

Parameters:

Name Type Description Default
collection_id str

The unique ID of the collection to make public.

required
api_key str

An API key for this request.

None

Returns:

Name Type Description
CollectionWebLocationResponse CollectionWebLocationResponse

An object containing the new public web URL.

Raises:

Type Description
DorsalClientError

For API errors (e.g., ConflictError if already public).

DorsalError

For other unexpected library errors.

Source code in venv/lib/python3.13/site-packages/dorsal/api/collection.py
def make_collection_public(
    collection_id: str,
    api_key: str | None = None,
) -> "CollectionWebLocationResponse":
    """
    Makes a private collection public.

    This is a high-level wrapper that handles client instantiation and error logging.

    Args:
        collection_id (str): The unique ID of the collection to make public.
        api_key (str, optional): An API key for this request.

    Returns:
        CollectionWebLocationResponse: An object containing the new public web URL.

    Raises:
        DorsalClientError: For API errors (e.g., ConflictError if already public).
        DorsalError: For other unexpected library errors.
    """
    from dorsal.session import get_shared_dorsal_client
    from dorsal.client import DorsalClient

    effective_client: DorsalClient
    log_message_context = "using shared client"
    if api_key:
        log_message_context = "using temporary client with provided API key"
        effective_client = DorsalClient(api_key=api_key)
    else:
        effective_client = get_shared_dorsal_client()

    logger.debug(
        "Dispatching make_collection_public to client (%s) for ID: '%s'",
        log_message_context,
        collection_id,
    )

    try:
        response = effective_client.make_collection_public(collection_id=collection_id)
        logger.info("Successfully made collection '%s' public.", collection_id)
        return response

    except DorsalError as err:
        logger.warning(
            "A client error occurred during make_collection_public (%s): %s",
            log_message_context,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "An unexpected error occurred during make_collection_public (%s).",
            log_message_context,
        )
        raise DorsalError(
            f"An unexpected error occurred while making collection '{collection_id}' public: {err}"
        ) from err

make_schema_validator

make_schema_validator(
    dataset_id, api_key=None, client=None
)

Fetches a dataset's schema and returns a callable validator function.

This is useful for validating records client-side before attempting to insert them, which can save API calls.

Example
from dorsal.api import make_schema_validator

validator = make_schema_validator("my-org/my-book-collection")

good_record = {"author": "J.R.R. Tolkien", "title": "The Hobbit"}
bad_record = {"author": "J.R.R. Tolkien"} # Missing title

try:
    validator(good_record)
    print("Good record is valid.")
    validator(bad_record)
except Exception as e:
    print(f"Bad record is invalid: {e}")

Parameters:

Name Type Description Default
dataset_id str

Identifier for the dataset whose schema will be used.

required
api_key str

An API key for this request.

None

Returns:

Name Type Description
JsonSchemaValidator JsonSchemaValidator

A callable instance that validates a dictionary record against the fetched schema.

Raises:

Type Description
ValueError

If dataset_id is invalid (propagated from get_dataset).

DorsalClientError

Base client error or for unexpected issues (propagated from get_dataset).

AuthError

Authentication failure (propagated from get_dataset).

NotFoundError

If the dataset is not found (propagated from get_dataset).

ForbiddenError

Access to the dataset is denied (propagated from get_dataset).

RateLimitError

If the request is rate-limited by the API (propagated from get_dataset).

NetworkError

If a network issue occurs (propagated from get_dataset).

APIError

For other HTTP errors from the API (propagated from get_dataset).

ApiDataValidationError

If the API response for the dataset is malformed and cannot be parsed into a valid Dataset object (propagated from get_dataset).

JSONSchemaSchemaError

If the schema is invalid.

Source code in venv/lib/python3.13/site-packages/dorsal/api/dataset.py
def make_schema_validator(
    dataset_id: str, api_key: str | None = None, client: DorsalClient | None = None
) -> JsonSchemaValidator:
    """Fetches a dataset's schema and returns a callable validator function.

    This is useful for validating records client-side before attempting to
    insert them, which can save API calls.

    Example:
        ```python
        from dorsal.api import make_schema_validator

        validator = make_schema_validator("my-org/my-book-collection")

        good_record = {"author": "J.R.R. Tolkien", "title": "The Hobbit"}
        bad_record = {"author": "J.R.R. Tolkien"} # Missing title

        try:
            validator(good_record)
            print("Good record is valid.")
            validator(bad_record)
        except Exception as e:
            print(f"Bad record is invalid: {e}")

        ```

    Args:
        dataset_id (str): Identifier for the dataset whose schema will be used.
        api_key (str, optional): An API key for this request.

    Returns:
        JsonSchemaValidator: A callable instance that validates a dictionary
            record against the fetched schema.

    Raises:
        ValueError: If `dataset_id` is invalid (propagated from `get_dataset`).
        DorsalClientError: Base client error or for unexpected issues (propagated
                           from `get_dataset`).
        AuthError: Authentication failure (propagated from `get_dataset`).
        NotFoundError: If the dataset is not found (propagated from `get_dataset`).
        ForbiddenError: Access to the dataset is denied (propagated from `get_dataset`).
        RateLimitError: If the request is rate-limited by the API (propagated from
                        `get_dataset`).
        NetworkError: If a network issue occurs (propagated from `get_dataset`).
        APIError: For other HTTP errors from the API (propagated from `get_dataset`).
        ApiDataValidationError: If the API response for the dataset is malformed and
                                cannot be parsed into a valid `Dataset` object
                                (propagated from `get_dataset`).
        JSONSchemaSchemaError: If the schema is invalid.

    """
    if dataset_id.startswith("open/"):
        schema_name = dataset_id.removeprefix("open/")
        try:
            return get_open_schema_validator(cast(Any, schema_name))
        except (ValueError, TypeError):
            pass

    if is_offline_mode():
        raise DorsalOfflineError(
            f"Cannot fetch validator for '{dataset_id}': System is in OFFLINE mode and this schema is locally available."
        )
    schema = get_dataset_schema(dataset_id=dataset_id, api_key=api_key, client=client)

    return get_json_schema_validator(schema=schema)

register_model

register_model(
    annotation_model,
    schema_id,
    validation_model=None,
    dependencies=None,
    options=None,
    overwrite=False,
    *,
    scope="project"
)

Programmatically registers a new annotation model in the dorsal config.
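
A minimal sketch of a registration call. InvoiceAnnotationModel and its module are hypothetical; the model class must be defined in an importable module (not the main script), and an open/ schema_id brings its own validator, so no validation_model is passed:

```python
from dorsal.api import register_model
from my_project.models import InvoiceAnnotationModel  # hypothetical AnnotationModel subclass

register_model(
    annotation_model=InvoiceAnnotationModel,
    schema_id="open/invoice",  # placeholder schema ID; 'open/' schemas supply their own validator
    overwrite=True,            # allow replacing an existing pipeline step
    scope="project",
)
```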

Source code in venv/lib/python3.13/site-packages/dorsal/api/config.py
def register_model(
    annotation_model: Type[AnnotationModel],
    schema_id: str,
    validation_model: dict | Type[Any] | JsonSchemaValidator | None = None,
    dependencies: list[ModelRunnerDependencyConfig] | ModelRunnerDependencyConfig | None = None,
    options: dict | None = None,
    overwrite: bool = False,
    *,
    scope: Literal["project", "global"] = "project",
) -> None:
    """
    Programmatically registers a new annotation model in the dorsal config.
    """
    from dorsal.common.model import is_pydantic_model_class, is_pydantic_model_instance
    from dorsal.common.validators.json_schema import (
        JsonSchemaValidator,
        JSON_SCHEMA_LIVENESS_KEYWORDS,
    )
    from dorsal.common.exceptions import DorsalConfigError, PydanticValidationError

    if scope not in ["project", "global"]:
        raise ValueError("Invalid scope. Must be one of 'project' or 'global'.")

    effective_dependencies_dicts = []
    if dependencies:
        dep_list = dependencies
        if not isinstance(dep_list, list):
            dep_list = [dep_list]
        for i, dep in enumerate(dep_list):
            d_any = cast(Any, dep)
            if is_pydantic_model_instance(d_any):
                effective_dependencies_dicts.append(d_any.model_dump())
            elif isinstance(d_any, dict):
                raise TypeError(
                    f"Item {i} in 'dependencies' is a dict. "
                    "Dependencies must be passed as instances of a "
                    "'ModelRunnerDependencyConfig' subclass (e.g., MediaTypeDependencyConfig)."
                )
            else:
                raise TypeError(
                    f"Item {i} in 'dependencies' is an invalid type ({type(d_any)}). "
                    "Must be an instance of 'ModelRunnerDependencyConfig'."
                )

    model_module, model_name = annotation_model.__module__, annotation_model.__name__
    if model_module == "__main__":
        raise TypeError(f"Model '{model_name}' must be defined in an importable module, not the main script.")
    model_path = (model_module, model_name)

    validation_model_config: tuple[str, str] | dict[str, Any] | None = None
    is_open_schema = schema_id.startswith("open/")

    if is_open_schema:
        schema_name = schema_id.removeprefix("open/")
        validator_path = (
            "dorsal.file.validators.open_schema",
            f"{schema_name}_validator",
        )
        if validation_model is not None:
            raise ValueError(
                f"Ambiguous configuration: You cannot provide a custom 'validation_model' when using an 'open/' schema_id ('{schema_id}')."
            )
        validation_model_config = validator_path

    elif validation_model is not None:
        if isinstance(validation_model, dict):
            if not any(key in validation_model for key in JSON_SCHEMA_LIVENESS_KEYWORDS):
                raise ValueError("The provided 'validation_model' schema dict appears to be inert.")
            validation_model_config = validation_model
        elif is_pydantic_model_class(validation_model):
            validator_module, validator_name = (
                validation_model.__module__,
                validation_model.__name__,
            )
            if validator_module == "__main__":
                raise TypeError(f"Validator class '{validator_name}' must be defined in an importable module.")
            validation_model_config = (validator_module, validator_name)
        elif isinstance(validation_model, JsonSchemaValidator):
            validator_module, validator_name = (
                validation_model.__module__,
                validation_model.__name__,
            )
            if validator_module == "__main__":
                raise TypeError(f"Validator instance '{validator_name}' must be defined in an importable module.")
            validation_model_config = (validator_module, validator_name)
        else:
            raise TypeError(f"Invalid 'validation_model' type ({type(validation_model)}).")

    new_step_data = {
        k: v
        for k, v in {
            "annotation_model": model_path,
            "schema_id": schema_id,
            "dependencies": effective_dependencies_dicts if effective_dependencies_dicts else None,
            "validation_model": validation_model_config,
            "options": options,
        }.items()
        if v is not None
    }

    try:
        validated_step_model = ModelRunnerPipelineStep.model_validate(new_step_data)
        toml_safe_step_data = validated_step_model.model_dump(mode="json", exclude_none=True)
    except PydanticValidationError as e:
        raise DorsalConfigError(f"The provided model configuration is invalid: {e}") from e

    try:
        PipelineConfig.upsert_step(step_data=toml_safe_step_data, overwrite=overwrite, scope=scope)
    except Exception as e:
        raise DorsalConfigError(f"Failed to register model in {scope} config: {e}") from e

remove_files_from_collection

remove_files_from_collection(
    collection_id, hashes, api_key=None
)

Removes a list of files from a remote collection by their hash.

This function automatically handles batching for large lists of hashes to comply with the API limit (10,000 per request).
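
A minimal sketch (placeholder ID and hashes; the counts come from the aggregated RemoveFilesResponse built in the source below):

```python
from dorsal.api import remove_files_from_collection

response = remove_files_from_collection(
    collection_id="col_123",  # placeholder collection ID
    hashes=["<sha256-hash-1>", "<sha256-hash-2>"],
)
print(f"Removed: {response.removed_count}, not found: {response.not_found_count}")
```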

Parameters:

Name Type Description Default
collection_id str

The unique ID of the collection to modify.

required
hashes list[str]

A list of SHA-256 file hashes to remove.

required
api_key str

An API key for this request.

None

Returns:

Name Type Description
RemoveFilesResponse RemoveFilesResponse

A consolidated response summarizing the results.

Raises:

Type Description
DorsalClientError

For API errors.

DorsalError

For other unexpected library errors.

Source code in venv/lib/python3.13/site-packages/dorsal/api/collection.py
def remove_files_from_collection(
    collection_id: str,
    hashes: list[str],
    api_key: str | None = None,
) -> "RemoveFilesResponse":
    """
    Removes a list of files from a remote collection by their hash.

    This function automatically handles batching for large lists of hashes
    to comply with the API limit (10,000 per request).

    Args:
        collection_id (str): The unique ID of the collection to modify.
        hashes (list[str]): A list of SHA-256 file hashes to remove.
        api_key (str, optional): An API key for this request.

    Returns:
        RemoveFilesResponse: A consolidated response summarizing the results.

    Raises:
        DorsalClientError: For API errors.
        DorsalError: For other unexpected library errors.
    """
    from dorsal.session import get_shared_dorsal_client
    from dorsal.client import DorsalClient
    from dorsal.client.validators import RemoveFilesResponse

    if not hashes:
        raise ValueError("The 'hashes' list cannot be empty.")

    effective_client: DorsalClient
    log_message_context = "using shared client"
    if api_key:
        log_message_context = "using temporary client with provided API key"
        effective_client = DorsalClient(api_key=api_key)
    else:
        effective_client = get_shared_dorsal_client()

    logger.debug(
        "Dispatching remove_files_from_collection to client (%s) for ID: '%s' with %d hashes.",
        log_message_context,
        collection_id,
        len(hashes),
    )

    total_removed = 0
    total_not_found = 0
    batches = [hashes[i : i + API_MAX_BATCH_SIZE] for i in range(0, len(hashes), API_MAX_BATCH_SIZE)]

    try:
        for i, batch in enumerate(batches):
            logger.debug(f"Processing batch {i + 1}/{len(batches)}...")
            response = effective_client.remove_files_from_collection(collection_id=collection_id, hashes=batch)
            total_removed += response.removed_count
            total_not_found += response.not_found_count

        aggregate_response = RemoveFilesResponse(removed_count=total_removed, not_found_count=total_not_found)
        logger.info(
            "Successfully finished removing files from collection '%s'. Removed: %d",
            collection_id,
            aggregate_response.removed_count,
        )
        return aggregate_response

    except DorsalError as err:
        logger.warning(
            "A client error occurred during remove_files_from_collection (%s): %s",
            log_message_context,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "An unexpected error occurred during remove_files_from_collection (%s).",
            log_message_context,
        )
        raise DorsalError(
            f"An unexpected error occurred while removing files from collection '{collection_id}': {err}"
        ) from err

remove_model_by_name

remove_model_by_name(name, scope='project')

Removes a model from the pipeline by its name (e.g., "PDFAnnotationModel"). Raises an error if the name is ambiguous (duplicates exist).
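
A minimal sketch, using the model name from the docstring's own example:

```python
from dorsal.api import remove_model_by_name

# Remove the pipeline step registered under this model name from the project config.
remove_model_by_name("PDFAnnotationModel", scope="project")
```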

Source code in venv/lib/python3.13/site-packages/dorsal/api/config.py
def remove_model_by_name(name: str, scope: Literal["project", "global"] = "project") -> None:
    """
    Removes a model from the pipeline by its name (e.g., "PDFAnnotationModel").
    Raises an error if the name is ambiguous (duplicates exist).
    """
    PipelineConfig.remove_step_by_name(name=name, scope=scope)

remove_tag_from_file

remove_tag_from_file(hash_string, tag_id, api_key=None)

Removes a specific tag from a file record.
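
A minimal sketch (hash and tag ID are placeholders):

```python
from dorsal.api import remove_tag_from_file

remove_tag_from_file(
    hash_string="<sha256-hash>",  # placeholder file hash
    tag_id="tag_123",             # placeholder tag ID
)
```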

Parameters:

Name Type Description Default
hash_string str

The hash of the file record.

required
tag_id str

The unique ID of the tag to remove.

required
api_key str

An API key for this request.

None
Source code in venv/lib/python3.13/site-packages/dorsal/api/file.py
def remove_tag_from_file(hash_string: str, tag_id: str, api_key: str | None = None) -> None:
    """
    Removes a specific tag from a file record.

    Args:
        hash_string (str): The hash of the file record.
        tag_id (str): The unique ID of the tag to remove.
        api_key (str, optional): An API key for this request.
    """
    from dorsal.session import get_shared_dorsal_client

    effective_client = get_shared_dorsal_client()
    if api_key:
        from dorsal.client import DorsalClient

        effective_client = DorsalClient(api_key=api_key)

    try:
        effective_client.delete_tag(file_hash=hash_string, tag_id=tag_id)
        return None
    except DorsalClientError:
        raise

scan_directory

scan_directory(
    dir_path,
    recursive=False,
    *,
    api_key=None,
    use_cache=True,
    offline=False
)

Scans a directory and returns a list of LocalFile objects.

This function is a high-level wrapper that processes all files in a given directory and generates their metadata locally using the Annotation Model pipeline.

It returns a list of LocalFile objects.

Example
from dorsal.api import scan_directory

# Scan a directory non-recursively for all files
processed_files = scan_directory("path/to/my_invoices")

print(f"Found {len(processed_files)} files to process.")

# You can now iterate over the list
for f in processed_files:
    if f.size > 500000:
        print(f"{f.name} is a large file.")

Parameters:

Name Type Description Default
dir_path str

The path to the directory you want to scan.

required
recursive bool

If True, scans all subdirectories recursively. Defaults to False.

False
api_key str | None

An API key for this operation, overriding the client's default. Defaults to None.

None
use_cache bool

If True, uses the local hash cache when processing files. Defaults to True.

True
offline bool

If True, the metadata pipeline is blocked from making network calls. Defaults to False.

False

Returns:

Type Description
list[LocalFile]

list[LocalFile]: A list of processed LocalFile objects from the directory.

Source code in venv/lib/python3.13/site-packages/dorsal/api/file.py
def scan_directory(
    dir_path: str, recursive: bool = False, *, api_key: str | None = None, use_cache: bool = True, offline: bool = False
) -> list[LocalFile]:
    """Scans a directory and returns a list of LocalFile objects.

    This function is a high-level wrapper that processes all files in a
    given directory and generates their metadata locally using the Annotation Model pipeline.

    It returns a list of `LocalFile` objects.

    Example:
        ```python
        from dorsal.api import scan_directory

        # Scan a directory non-recursively for all files
        processed_files = scan_directory("path/to/my_invoices")

        print(f"Found {len(processed_files)} files to process.")

        # You can now iterate over the list
        for f in processed_files:
            if f.size > 500000:
                print(f"{f.name} is a large file.")
        ```

    Args:
        dir_path (str): The path to the directory you want to scan.
        recursive (bool, optional): If True, scans all subdirectories
            recursively. Defaults to False.
        api_key (str | None, optional): An API key for this operation,
            overriding the client's default. Defaults to None.
        use_cache (bool, optional): If True, uses the local hash cache when
            processing files. Defaults to True.
        offline (bool, optional): If True, the metadata pipeline is blocked
            from making network calls. Defaults to False.

    Returns:
        list[LocalFile]: A list of processed `LocalFile` objects from the directory.
    """
    from dorsal.file.metadata_reader import MetadataReader

    effective_reader: MetadataReader
    if api_key is not None or offline:
        if api_key:
            logger.debug(
                "API key override for scan_directory (path: '%s'). Attaching MetadataReader configured with this API Key",
                dir_path,
            )
        if offline:
            logger.debug(
                "Offline mode. Attached MetadataReader blocked from making network calls",
            )
        effective_reader = MetadataReader(api_key=api_key)
    else:
        logger.debug(
            "No API key override for scan_directory (file: '%s'). Using shared METADATA_READER.",
            dir_path,
        )
        effective_reader = get_metadata_reader()

    logger.debug(
        "High-level scan_directory calling effective MetadataReader for dir_path='%s', recursive=%s.",
        dir_path,
        recursive,
    )

    try:
        local_files = effective_reader.scan_directory(dir_path=dir_path, recursive=recursive, skip_cache=not use_cache)
        logger.debug(
            "Effective MetadataReader.scan_directory completed for dir_path='%s'. Found %d LocalFile objects.",
            dir_path,
            len(local_files),
        )
        return local_files
    except (FileNotFoundError, DorsalError) as err:
        logger.warning(
            "Call to effective MetadataReader.scan_directory for dir_path='%s' failed: %s - %s",
            dir_path,
            type(err).__name__,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "Unexpected error in high-level scan_directory for dir_path='%s'",
            dir_path,
        )
        if isinstance(err, DorsalError):
            raise
        raise DorsalError(f"An unexpected error occurred while reading directory '{dir_path}'.") from err

scan_file

scan_file(
    file_path,
    *,
    api_key=None,
    use_cache=True,
    offline=False
)

Processes a single file and returns a LocalFile object.

This is a direct wrapper for dorsal.LocalFile. It's a convenient entry point for processing a single file and accessing its metadata without needing to import the LocalFile class directly.

Example
from dorsal.api import scan_file

local_file = scan_file("path/to/my_image.jpg")

print(f"File: {local_file.name}")
print(f"Media Type: {local_file.media_type}")

Parameters:

Name Type Description Default
file_path str

The path to the local file to process.

required
api_key str | None

An API key for this request, overriding the client's default. Defaults to None.

None
use_cache bool

If True, uses the local hash cache when processing the file. Defaults to True.

True
offline bool

If True, the metadata pipeline is blocked from making network calls. Defaults to False.

False

Returns:

Name Type Description
LocalFile LocalFile

An initialized LocalFile instance with extracted metadata.

Source code in venv/lib/python3.13/site-packages/dorsal/api/file.py
def scan_file(
    file_path: str, *, api_key: str | None = None, use_cache: bool = True, offline: bool = False
) -> LocalFile:
    """Processes a single file and returns a LocalFile object.

    This is a direct wrapper for `dorsal.LocalFile`. It's a convenient
    entry point for processing a single file and accessing its metadata
    without needing to import the `LocalFile` class directly.

    Example:
        ```python
        from dorsal.api import scan_file

        local_file = scan_file("path/to/my_image.jpg")

        print(f"File: {local_file.name}")
        print(f"Media Type: {local_file.media_type}")
        ```

    Args:
        file_path (str): The path to the local file to process.
        api_key (str | None, optional): An API key for this request,
            overriding the client's default. Defaults to None.
        use_cache (bool, optional): If True, uses the local hash cache when
            processing the file. Defaults to True.
        offline (bool, optional): If True, the metadata pipeline is blocked
            from making network calls. Defaults to False.

    Returns:
        LocalFile: An initialized `LocalFile` instance with extracted metadata.
    """
    from dorsal.file.metadata_reader import MetadataReader

    effective_reader: MetadataReader

    if api_key is not None or offline:
        if api_key:
            logger.debug(
                "API key override for scan_file (file: '%s'). Attaching MetadataReader configured with this API Key",
                file_path,
            )
        if offline:
            logger.debug(
                "Offline mode. Attached MetadataReader blocked from making network calls",
            )
        effective_reader = MetadataReader(api_key=api_key)
    else:
        logger.debug(
            "No API key override for scan_file (file: '%s'). Using shared METADATA_READER.",
            file_path,
        )
        effective_reader = get_metadata_reader()

    logger.debug("High-level scan_file calling effective MetadataReader for file_path='%s'.", file_path)

    try:
        local_file = effective_reader.scan_file(file_path=file_path, skip_cache=not use_cache)
        logger.debug(
            "Effective MetadataReader.scan_file completed for file_path='%s'. Hash: %s",
            file_path,
            local_file.hash if hasattr(local_file, "hash") else "N/A",
        )
        return local_file
    except (FileNotFoundError, IOError, DorsalError) as err:
        logger.warning(
            "Call to effective MetadataReader.scan_file for file_path='%s' failed: %s - %s",
            file_path,
            type(err).__name__,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "Unexpected error in high-level scan_file for file_path='%s'",
            file_path,
        )
        if isinstance(err, DorsalError):
            raise
        raise DorsalError(f"An unexpected error occurred while reading file '{file_path}'.") from err
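
Example

The keyword-only options can be combined; a short sketch, assuming the LocalFile attributes (`name`, `hash`) referenced in the source's logging:

```python
from dorsal.api import scan_file

# Re-read the file from disk (skip the cache) and block all network calls.
local_file = scan_file("path/to/my_image.jpg", use_cache=False, offline=True)

print(f"File: {local_file.name}")
print(f"Hash: {local_file.hash}")
```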

search_global_files

search_global_files(
    query: str,
    page: int = 1,
    per_page: int = 25,
    sort_by: Literal[
        "date_modified", "date_created", "size", "name"
    ] = "date_modified",
    sort_order: Literal["asc", "desc"] = "desc",
    api_key: str | None = None,
    *,
    match_any: bool = False,
    mode: Literal["pydantic"],
    model_dump_exclude_none: bool = True,
    model_dump_by_alias: bool = True
) -> "FileSearchResponse"
search_global_files(
    query: str,
    page: int = 1,
    per_page: int = 25,
    sort_by: Literal[
        "date_modified", "date_created", "size", "name"
    ] = "date_modified",
    sort_order: Literal["asc", "desc"] = "desc",
    api_key: str | None = None,
    *,
    match_any: bool = False,
    mode: Literal["dict"],
    model_dump_exclude_none: bool = True,
    model_dump_by_alias: bool = True
) -> dict[str, Any]
search_global_files(
    query: str,
    page: int = 1,
    per_page: int = 25,
    sort_by: Literal[
        "date_modified", "date_created", "size", "name"
    ] = "date_modified",
    sort_order: Literal["asc", "desc"] = "desc",
    api_key: str | None = None,
    *,
    match_any: bool = False,
    mode: Literal["json"],
    model_dump_exclude_none: bool = True,
    model_dump_by_alias: bool = True
) -> str
search_global_files(
    query,
    page=1,
    per_page=25,
    sort_by="date_modified",
    sort_order="desc",
    api_key=None,
    *,
    match_any=False,
    mode="pydantic",
    model_dump_exclude_none=True,
    model_dump_by_alias=True
)

Searches for public file records across the entire DorsalHub platform.

This function provides a simple interface to search all public files. Note: This is a premium feature and requires an appropriate account status.

Example
from dorsal.api import search_global_files

# Find all publicly indexed files tagged with 'research'
try:
    response = search_global_files(
        query="tag:research",
        mode="dict"
    )

    print(f"Found {response['pagination']['record_count']} public files tagged 'research'.")
    for record in response['results']:
        print(f"- {record['name']} (hash: {record['hash']})")

except Exception as e:
    print(f"An error occurred during search: {e}")

Parameters:

query (str): The search query string. Supports operators like tag:, name:, extension:, and size:>1MB. Required.
mode (Literal["pydantic", "dict", "json"]): The desired return format. Defaults to 'pydantic'.
page (int): The page number for pagination. Defaults to 1.
per_page (int): The number of results per page. Must be between 1 and 50. Defaults to 25.
sort_by (Literal["date_modified", "date_created", "size", "name"]): The field to sort results by. Defaults to 'date_modified'.
sort_order (Literal["asc", "desc"]): The sort order. Defaults to 'desc'.
match_any (bool): If True, records matching any of the query terms are returned instead of only records matching all terms. Defaults to False.
api_key (str | None): An API key for this request, overriding the client's default. Defaults to None.
model_dump_exclude_none (bool): Whether to omit None-valued fields when mode is "dict" or "json". Defaults to True.
model_dump_by_alias (bool): Whether to serialize fields by their alias when mode is "dict" or "json". Defaults to True.

Returns:

FileSearchResponse | dict | str: The search results, formatted according to the specified mode.

Raises:

DorsalClientError: For client-side validation errors or API errors like authentication, rate limiting, or insufficient permissions (e.g. using this feature on a non-premium account).
DorsalError: For other unexpected library errors.

Source code in venv/lib/python3.13/site-packages/dorsal/api/file.py
def search_global_files(
    query: str,
    page: int = 1,
    per_page: int = 25,
    sort_by: Literal["date_modified", "date_created", "size", "name"] = "date_modified",
    sort_order: Literal["asc", "desc"] = "desc",
    api_key: str | None = None,
    *,
    match_any: bool = False,
    mode: Literal["pydantic", "dict", "json"] = "pydantic",
    model_dump_exclude_none: bool = True,
    model_dump_by_alias: bool = True,
) -> "FileSearchResponse | dict | str":
    """Searches for public file records across the entire DorsalHub platform.

    This function provides a simple interface to search all public files.
    Note: This is a premium feature and requires an appropriate account status.

    Example:
        ```python
        from dorsal.api import search_global_files

        # Find all publicly indexed files tagged with 'research'
        try:
            response = search_global_files(
                query="tag:research",
                mode="dict"
            )

            print(f"Found {response['pagination']['record_count']} public files tagged 'research'.")
            for record in response['results']:
                print(f"- {record['name']} (hash: {record['hash']})")

        except Exception as e:
            print(f"An error occurred during search: {e}")
        ```

    Args:
        query (str): The search query string. Supports operators like
            `tag:`, `name:`, `extension:`, and `size:>1MB`.
        mode (Literal["pydantic", "dict", "json"]): The desired return format.
            Defaults to "pydantic".
        page (int): The page number for pagination. Defaults to 1.
        per_page (int): The number of results per page. Must be between 1 and 50.
            Defaults to 25.
        sort_by (Literal): The field to sort results by. Defaults to 'date_modified'.
        sort_order (Literal): The sort order ('asc' or 'desc'). Defaults to 'desc'.
        match_any (bool): If True, records matching any of the query terms are
            returned instead of only records matching all terms. Defaults to False.
        api_key (str | None): An API key for this request, overriding the
            client's default. Defaults to None.
        model_dump_exclude_none (bool): Whether to omit None-valued fields when
            mode is "dict" or "json". Defaults to True.
        model_dump_by_alias (bool): Whether to serialize fields by their alias
            when mode is "dict" or "json". Defaults to True.

    Returns:
        Union[FileSearchResponse, dict, str]: The search results, formatted
            according to the specified `mode`.

    Raises:
        DorsalClientError: For client-side validation errors or API errors like
            authentication, rate limiting, or insufficient permissions (e.g.
            using this feature on a non-premium account).
        DorsalError: For other unexpected library errors.
    """
    from dorsal.client import DorsalClient

    effective_client: DorsalClient
    log_message_context = ""
    if api_key:
        log_message_context = "using temporary client with provided API key"
        logger.debug("API key override provided for global search. Creating temporary DorsalClient.")
        effective_client = DorsalClient(api_key=api_key)
    else:
        log_message_context = "using shared client via MetadataReader"
        logger.debug("No API key override for global search. Using shared client instance.")
        effective_client = get_metadata_reader()._client

    logger.debug(
        "Dispatching global file search to client (%s) with query: '%s'",
        log_message_context,
        query,
    )

    try:
        response = effective_client.search_files(
            q=query,
            scope="global",
            page=page,
            per_page=per_page,
            sort_by=sort_by,
            sort_order=sort_order,
            match_any=match_any,
        )

        logger.debug(
            "Global file search successful. Returned page %d of %d, with %d records.",
            response.pagination.current_page,
            response.pagination.page_count,
            len(response.results),
        )

        if mode == "pydantic":
            return response
        if mode == "dict":
            return response.model_dump(
                by_alias=model_dump_by_alias,
                exclude_none=model_dump_exclude_none,
                mode="json",
            )
        if mode == "json":
            return response.model_dump_json(
                by_alias=model_dump_by_alias,
                exclude_none=model_dump_exclude_none,
                indent=2,
            )
        raise ValueError(f"Invalid mode: '{mode}'. Must be one of 'pydantic', 'dict', or 'json'.")  # pragma: no cover

    except DorsalError as err:
        logger.warning(
            "A client error occurred during search_global_files (%s): %s - %s",
            log_message_context,
            type(err).__name__,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "An unexpected error occurred during search_global_files (%s).",
            log_message_context,
        )
        raise DorsalError(f"An unexpected error occurred during global file search: {err}") from err
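
Example

A pagination sketch in the default "pydantic" mode; it relies only on the response attributes the source's logging already uses (results, pagination.current_page, pagination.page_count):

```python
from dorsal.api import search_global_files

# Walk every page of public PDF records, 50 at a time (the per-page maximum).
page = 1
while True:
    response = search_global_files(query="extension:pdf", page=page, per_page=50)
    for record in response.results:
        print(record)
    if response.pagination.current_page >= response.pagination.page_count:
        break
    page += 1
```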

search_user_files

search_user_files(
    query: str,
    deduplicate: bool = True,
    page: int = 1,
    per_page: int = 25,
    sort_by: Literal[
        "date_modified", "date_created", "size", "name"
    ] = "date_modified",
    sort_order: Literal["asc", "desc"] = "desc",
    *,
    match_any: bool = False,
    api_key: str | None = None,
    mode: Literal["pydantic"],
    model_dump_exclude_none: bool = True,
    model_dump_by_alias: bool = True
) -> "FileSearchResponse"
search_user_files(
    query: str,
    deduplicate: bool = True,
    page: int = 1,
    per_page: int = 25,
    sort_by: Literal[
        "date_modified", "date_created", "size", "name"
    ] = "date_modified",
    sort_order: Literal["asc", "desc"] = "desc",
    *,
    match_any: bool = False,
    api_key: str | None = None,
    mode: Literal["dict"],
    model_dump_exclude_none: bool = True,
    model_dump_by_alias: bool = True
) -> dict[str, Any]
search_user_files(
    query: str,
    deduplicate: bool = True,
    page: int = 1,
    per_page: int = 25,
    sort_by: Literal[
        "date_modified", "date_created", "size", "name"
    ] = "date_modified",
    sort_order: Literal["asc", "desc"] = "desc",
    *,
    match_any: bool = False,
    api_key: str | None = None,
    mode: Literal["json"],
    model_dump_exclude_none: bool = True,
    model_dump_by_alias: bool = True
) -> str
search_user_files(
    query,
    deduplicate=True,
    page=1,
    per_page=25,
    sort_by="date_modified",
    sort_order="desc",
    *,
    match_any=False,
    api_key=None,
    mode="pydantic",
    model_dump_exclude_none=True,
    model_dump_by_alias=True
)

Searches for file records indexed by the authenticated user.

This function provides a simple and powerful interface to search for files you have indexed on DorsalHub. The query supports simple text matching as well as advanced operators.

Example
from dorsal.api import search_user_files

# Find all PDF files you have indexed, sorted by name
try:
    response = search_user_files(
        query="extension:pdf",
        sort_by="name",
        sort_order="asc",
        mode="dict"
    )

    print(f"Found {response['pagination']['record_count']} matching PDF files.")
    for record in response['results']:
        print(f"- {record['name']}")

except Exception as e:
    print(f"An error occurred during search: {e}")

Parameters:

query (str): The search query string. Supports operators like tag:, name:, extension:, and size:>1MB. Required.
mode (Literal["pydantic", "dict", "json"]): The desired return format. Defaults to 'pydantic'.
deduplicate (bool): If True, returns only unique file records based on their content hash. Defaults to True.
page (int): The page number for pagination. Defaults to 1.
per_page (int): The number of results per page. Must be between 1 and 50. Defaults to 25.
sort_by (Literal["date_modified", "date_created", "size", "name"]): The field to sort results by. Defaults to 'date_modified'.
sort_order (Literal["asc", "desc"]): The sort order. Defaults to 'desc'.
match_any (bool): If True, records matching any of the query terms are returned instead of only records matching all terms. Defaults to False.
api_key (str | None): An API key for this request, overriding the client's default. Defaults to None.
model_dump_exclude_none (bool): Whether to omit None-valued fields when mode is "dict" or "json". Defaults to True.
model_dump_by_alias (bool): Whether to serialize fields by their alias when mode is "dict" or "json". Defaults to True.

Returns:

FileSearchResponse | dict | str: The search results, formatted according to the specified mode.

Raises:

DorsalClientError: For client-side validation errors or API errors like authentication or rate limiting.
DorsalError: For other unexpected library errors.

Source code in venv/lib/python3.13/site-packages/dorsal/api/file.py
def search_user_files(
    query: str,
    deduplicate: bool = True,
    page: int = 1,
    per_page: int = 25,
    sort_by: Literal["date_modified", "date_created", "size", "name"] = "date_modified",
    sort_order: Literal["asc", "desc"] = "desc",
    *,
    match_any: bool = False,
    api_key: str | None = None,
    mode: Literal["pydantic", "dict", "json"] = "pydantic",
    model_dump_exclude_none: bool = True,
    model_dump_by_alias: bool = True,
) -> "FileSearchResponse | dict | str":
    """Searches for file records indexed by the authenticated user.

    This function provides a simple and powerful interface to search for files
    you have indexed on DorsalHub. The query supports simple text matching as
    well as advanced operators.

    Example:
        ```python
        from dorsal.api import search_user_files

        # Find all PDF files you have indexed, sorted by name
        try:
            response = search_user_files(
                query="extension:pdf",
                sort_by="name",
                sort_order="asc",
                mode="dict"
            )

            print(f"Found {response['pagination']['record_count']} matching PDF files.")
            for record in response['results']:
                print(f"- {record['name']}")

        except Exception as e:
            print(f"An error occurred during search: {e}")
        ```

    Args:
        query (str): The search query string. Supports operators like
            `tag:`, `name:`, `extension:`, and `size:>1MB`.
        mode (Literal["pydantic", "dict", "json"]): The desired return format.
            Defaults to "pydantic".
        deduplicate (bool): If True, returns only unique file records based on
            their content hash. Defaults to True.
        page (int): The page number for pagination. Defaults to 1.
        per_page (int): The number of results per page. Must be between 1 and 50.
            Defaults to 25.
        sort_by (Literal): The field to sort results by. Defaults to 'date_modified'.
        sort_order (Literal): The sort order ('asc' or 'desc'). Defaults to 'desc'.
        match_any (bool): If True, records matching any of the query terms are
            returned instead of only records matching all terms. Defaults to False.
        api_key (str | None): An API key for this request, overriding the
            client's default. Defaults to None.
        model_dump_exclude_none (bool): Whether to omit None-valued fields when
            mode is "dict" or "json". Defaults to True.
        model_dump_by_alias (bool): Whether to serialize fields by their alias
            when mode is "dict" or "json". Defaults to True.

    Returns:
        Union[FileSearchResponse, dict, str]: The search results, formatted
            according to the specified `mode`.

    Raises:
        DorsalClientError: For client-side validation errors or API errors
            like authentication or rate limiting.
        DorsalError: For other unexpected library errors.
    """
    from dorsal.client import DorsalClient

    effective_client: DorsalClient
    log_message_context = ""
    if api_key:
        log_message_context = "using temporary client with provided API key"
        logger.debug("API key override provided for search. Creating temporary DorsalClient.")
        effective_client = DorsalClient(api_key=api_key)
    else:
        log_message_context = "using shared client via MetadataReader"
        logger.debug("No API key override for search. Using shared client instance.")
        effective_client = get_metadata_reader()._client

    logger.debug(
        "Dispatching user file search to client (%s) with query: '%s'",
        log_message_context,
        query,
    )

    try:
        response = effective_client.search_files(
            q=query,
            scope="user",
            deduplicate=deduplicate,
            page=page,
            per_page=per_page,
            sort_by=sort_by,
            sort_order=sort_order,
            match_any=match_any,
        )

        logger.debug(
            "User file search successful. Returned page %d of %d, with %d records.",
            response.pagination.current_page,
            response.pagination.page_count,
            len(response.results),
        )

        if mode == "pydantic":
            return response
        if mode == "dict":
            return response.model_dump(
                by_alias=model_dump_by_alias,
                exclude_none=model_dump_exclude_none,
                mode="json",
            )
        if mode == "json":
            return response.model_dump_json(
                by_alias=model_dump_by_alias,
                exclude_none=model_dump_exclude_none,
                indent=2,
            )

        raise ValueError(f"Invalid mode: '{mode}'. Must be one of 'pydantic', 'dict', or 'json'.")  # pragma: no cover

    except DorsalError as err:
        logger.warning(
            "A client error occurred during search_user_files (%s): %s - %s",
            log_message_context,
            type(err).__name__,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "An unexpected error occurred during search_user_files (%s).",
            log_message_context,
        )
        raise DorsalError(f"An unexpected error occurred during file search: {err}") from err
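
Example

A sketch of the "json" mode, using the documented size: operator; deduplicate=False keeps multiple records that share a content hash:

```python
from dorsal.api import search_user_files

# Pretty-printed JSON string (model_dump_json with indent=2 in the source above).
json_payload = search_user_files(
    query="size:>1MB",
    deduplicate=False,
    mode="json",
)
print(json_payload)
```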

show_model_pipeline

show_model_pipeline(scope='effective')

Returns a simplified, human-readable summary of the pipeline.

Source code in venv/lib/python3.13/site-packages/dorsal/api/config.py
def show_model_pipeline(scope: Literal["effective", "project", "global"] = "effective") -> list[dict[str, Any]]:
    """
    Returns a simplified, human-readable summary of the pipeline.
    """
    steps = PipelineConfig.get_steps(scope=scope)
    summary = []
    for i, step in enumerate(steps):
        deps_str = "None"
        if step.dependencies:
            deps = [d.type for d in step.dependencies]
            deps_str = ", ".join(deps)

        status = "Active"
        if step.deactivated:
            status = "Deactivated"
        if i == 0:
            status = "Base (Locked)"

        summary.append(
            {
                "index": i,
                "status": status,
                "name": step.annotation_model.name,
                "module": step.annotation_model.module,
                "schema_id": step.schema_id,
                "dependencies": deps_str,
            }
        )
    return summary
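
Example

A minimal sketch that prints the summary, assuming show_model_pipeline is exported from dorsal.api like the other helpers on this page; the dictionary keys match those built in the source above:

```python
from dorsal.api import show_model_pipeline

# One line per pipeline step: index, status, model name, and module.
for step in show_model_pipeline(scope="project"):
    print(f"[{step['index']}] {step['status']:<14} {step['name']} ({step['module']})")
```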

update_collection

update_collection(
    collection_id: str,
    name: str | None = None,
    description: str | None = None,
    api_key: str | None = None,
    *,
    mode: Literal["pydantic"]
) -> FileCollection
update_collection(
    collection_id: str,
    name: str | None = None,
    description: str | None = None,
    api_key: str | None = None,
    *,
    mode: Literal["dict"]
) -> dict[str, Any]
update_collection(
    collection_id: str,
    name: str | None = None,
    description: str | None = None,
    api_key: str | None = None,
    *,
    mode: Literal["json"]
) -> str
update_collection(
    collection_id,
    name=None,
    description=None,
    api_key=None,
    *,
    mode="pydantic"
)

Updates the metadata of a remote collection on DorsalHub.

Parameters:

collection_id (str): The unique ID of the collection to update. Required.
name (str, optional): The new name for the collection. Defaults to None.
description (str, optional): The new description for the collection. Defaults to None.
api_key (str, optional): An API key for this request. Defaults to None.
mode (Literal["pydantic", "dict", "json"]): The desired return format. Defaults to 'pydantic'.

Returns:

FileCollection | dict[str, Any] | str: The updated collection data, formatted according to the specified mode.

Raises:

DorsalClientError: For API errors.
DorsalError: For other unexpected library errors.

Source code in venv/lib/python3.13/site-packages/dorsal/api/collection.py
def update_collection(
    collection_id: str,
    name: str | None = None,
    description: str | None = None,
    api_key: str | None = None,
    *,
    mode: Literal["pydantic", "dict", "json"] = "pydantic",
) -> "FileCollection | dict[str, Any] | str":
    """
    Updates the metadata of a remote collection on DorsalHub.

    Args:
        collection_id (str): The unique ID of the collection to update.
        name (str, optional): The new name for the collection.
        description (str, optional): The new description for the collection.
        api_key (str, optional): An API key for this request.
        mode (Literal["pydantic", "dict", "json"]): The desired return format.

    Returns:
        The updated collection data, formatted according to the specified `mode`.

    Raises:
        DorsalClientError: For API errors.
        DorsalError: For other unexpected library errors.
    """
    from dorsal.session import get_shared_dorsal_client
    from dorsal.client import DorsalClient

    if not any([name, description]):
        raise ValueError("At least one field (name or description) must be provided to update.")

    effective_client: DorsalClient

    if api_key:
        log_message_context = "using temporary client with provided API key"
        effective_client = DorsalClient(api_key=api_key)
    else:
        log_message_context = "using shared client"
        effective_client = get_shared_dorsal_client()

    logger.debug(
        "Dispatching update_collection to client (%s) for ID: '%s'",
        log_message_context,
        collection_id,
    )

    try:
        response = effective_client.update_collection(collection_id=collection_id, name=name, description=description)

        if mode == "pydantic":
            return response
        if mode == "dict":
            return response.model_dump(mode="json", by_alias=True, exclude_none=True)
        if mode == "json":
            return response.model_dump_json(indent=2, by_alias=True, exclude_none=True)

        raise ValueError(f"Invalid mode: '{mode}'.")

    except DorsalError as err:
        logger.warning(
            "A client error occurred during update_collection (%s): %s",
            log_message_context,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "An unexpected error occurred during update_collection (%s).",
            log_message_context,
        )
        raise DorsalError(f"An unexpected error occurred while updating collection '{collection_id}': {err}") from err
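
Example

A short sketch that renames a collection and returns the updated record as a dict; the collection ID is a hypothetical placeholder:

```python
from dorsal.api import update_collection

updated = update_collection(
    collection_id="col_12345",  # hypothetical ID for illustration
    name="Quarterly Reports",
    mode="dict",
)
print(updated)
```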

validate_dataset_records

validate_dataset_records(
    dataset_id,
    records,
    schema_dict=None,
    api_key=None,
    client=None,
)

Validates records against a dataset's JSON schema.

Orchestrates schema retrieval (if a schema is not provided directly), custom validator preparation, and record-by-record validation. Returns a summary of the validation results.

Parameters:

dataset_id (str): Identifier of the dataset. Used to fetch the schema if schema_dict is None, and for logging/error context. Required.
records (list[dict]): A list of dictionaries, where each dictionary is a record to be validated. Required.
schema_dict (dict | None): Optional. A pre-fetched JSON schema dictionary. If provided, get_dataset_schema will not be called. Defaults to None.
api_key (str | None): Optional API key, used by get_dataset_schema if schema_dict is not provided. Uses the client's default if None. Defaults to None.
client (DorsalClient | None): Optional. An existing DorsalClient instance passed to get_dataset_schema when fetching the schema. Defaults to None.

Returns:

dict: A summary of validation results, including counts for total, valid, and invalid records, and detailed error information for each invalid record.

Raises:

ValueError: If dataset_id is invalid, records is not a list, or if schema_dict is provided but is not a valid, non-empty dictionary.
ApiDataValidationError: If a schema (fetched or provided) is invalid or cannot be used to prepare a validator (e.g., due to JSONSchemaSchemaError during preparation).
DorsalClientError: (And its subclasses like AuthError, NotFoundError, NetworkError, APIError, etc.) Propagated if get_dataset_schema is called and encounters an issue.
JSONSchemaSchemaError: Propagated from validate_records_with_validator if the validator's schema has issues found during the record validation loop (should be rare if prepare_custom_validator succeeds).

Source code in venv/lib/python3.13/site-packages/dorsal/api/dataset.py
def validate_dataset_records(
    dataset_id: str,
    records: list[dict],
    schema_dict: dict | None = None,
    api_key: str | None = None,
    client: DorsalClient | None = None,
) -> dict:
    """Validates records against a dataset's JSON schema.

    Orchestrates schema retrieval (if a schema is not provided directly),
    custom validator preparation, and record-by-record validation.
    Returns a summary of the validation results.

    Args:
        dataset_id: Identifier of the dataset. Used to fetch the schema if
                    `schema_dict` is None, and for logging/error context.
        records: A list of dictionaries, where each dictionary is a record
                 to be validated.
        schema_dict: Optional. A pre-fetched JSON schema dictionary. If provided,
                     `get_dataset_schema` will not be called.
        api_key: Optional API key, used by `get_dataset_schema` if `schema_dict` is not
                 provided. Uses the client's default if None.
        client: Optional. An existing `DorsalClient` instance passed to
                `get_dataset_schema` when fetching the schema.

    Returns:
        dict: A summary of validation results, including counts for total,
              valid, and invalid records, and detailed error information for
              each invalid record.

    Raises:
        ValueError: If `dataset_id` is invalid, `records` is not a list,
                    or if `schema_dict` is provided but is not a valid,
                    non-empty dictionary.
        ApiDataValidationError: If a schema (fetched or provided) is invalid or
                                cannot be used to prepare a validator (e.g., due
                                to `JSONSchemaSchemaError` during preparation).
        DorsalClientError: (And its subclasses like AuthError, NotFoundError,
                           NetworkError, APIError, etc.) Propagated if `get_dataset_schema`
                           is called and encounters an issue.
        JSONSchemaSchemaError: Propagated from `validate_records_with_validator`
                               if the validator's schema has issues found during
                               the record validation loop (should be rare if
                               `prepare_custom_validator` succeeds).
    """
    logger.debug(
        "Initiating record validation for dataset_id: '%s'. %s records. Schema provided: %s. API key: %s. Custom DorsalClient: %s",
        dataset_id,
        (len(records) if isinstance(records, list) else "Invalid 'records' input (not a list)"),
        "Yes" if schema_dict is not None else "No",
        "Yes (user-provided)" if api_key else "No (client default)",
        "Yes" if client else "No",
    )

    if not (isinstance(dataset_id, str) and dataset_id.strip()):
        logger.warning(
            "Dataset ID must be a non-empty string. Got: '%s' (type: %s)",
            dataset_id,
            type(dataset_id).__name__,
        )
        raise ValueError("Dataset ID must be a non-empty string.")

    if not isinstance(records, list):
        logger.warning("Input 'records' must be a list. Got: %s", type(records).__name__)  # type: ignore[unreachable]
        raise ValueError(f"Input 'records' must be a list, got {type(records).__name__}.")  # type: ignore[unreachable]

    actual_schema_to_use: dict
    if schema_dict is not None:
        logger.debug("Using user-provided schema for dataset_id: '%s'.", dataset_id)
        if not isinstance(schema_dict, dict) or not schema_dict:
            logger.warning(
                "Provided schema_dict for dataset_id '%s' must be a non-empty dictionary. Got type: %s",
                dataset_id,
                type(schema_dict).__name__,
            )
            raise ValueError("Provided schema_dict must be a non-empty dictionary.")
        actual_schema_to_use = schema_dict
    else:
        logger.debug("Schema not provided for dataset_id: '%s'; attempting to fetch.", dataset_id)
        try:
            actual_schema_to_use = get_dataset_schema(dataset_id=dataset_id.strip(), api_key=api_key, client=client)
            logger.debug("Successfully fetched schema for dataset_id: '%s'.", dataset_id)
        except Exception as err:
            logger.warning(
                "Failed to fetch schema for dataset_id '%s' (needed for validation): %s - %s",
                dataset_id,
                type(err).__name__,
                err,
            )
            raise

    try:
        validator = get_json_schema_validator(schema=actual_schema_to_use)
        logger.debug("Schema validator prepared successfully for dataset_id: '%s'.", dataset_id)
    except (ValueError, ApiDataValidationError) as err:
        logger.warning(
            "Failed to prepare schema validator for dataset_id '%s' using the schema. Error: %s - %s",
            dataset_id,
            type(err).__name__,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "Unexpected error preparing schema validator for dataset_id '%s'.",
            dataset_id,
        )
        raise ApiDataValidationError(
            f"Could not prepare validator for dataset_id '{dataset_id}' due to an unexpected error with the schema or validator setup."
        ) from err

    try:
        validation_summary = json_schema_validate_records(records=records, validator=validator)
        logger.debug(
            "Record validation process completed via helper for dataset_id: '%s'.",
            dataset_id,
        )
        return validation_summary
    except (ValueError, JSONSchemaSchemaError) as err:
        logger.warning(
            "Record validation failed for dataset_id '%s' due to issues within the validator or record structure: %s - %s",
            dataset_id,
            type(err).__name__,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "Unexpected error during the record validation stage for dataset_id '%s'.",
            dataset_id,
        )
        raise DorsalError(
            f"An unexpected error occurred while validating records for dataset_id '{dataset_id}'."
        ) from err
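
Example

A self-contained sketch that supplies schema_dict directly, so no schema fetch (and no network call) is needed; the dataset ID and schema are hypothetical placeholders:

```python
from dorsal.api import validate_dataset_records

schema = {
    "type": "object",
    "properties": {"name": {"type": "string"}},
    "required": ["name"],
}
records = [{"name": "ok"}, {"name": 123}]  # second record violates the schema

summary = validate_dataset_records(
    dataset_id="my-dataset",  # hypothetical ID for illustration
    records=records,
    schema_dict=schema,
)
print(summary)  # counts of total/valid/invalid records plus per-record error details
```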