High Level Functions

These functions provide a high-level interface for generating and managing file records, both locally and on DorsalHub.

dorsal.api

activate_model_by_name

activate_model_by_name(name, scope='project')

Activates (enables) a model in the pipeline by name.
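
A minimal usage sketch (the model name "exif" is a placeholder, not a model that necessarily ships with the library):

from dorsal.api import activate_model_by_name

# Enable a pipeline model for the current project (the default scope).
activate_model_by_name("exif")

# Enable the same model globally, for all projects.
activate_model_by_name("exif", scope="global")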

Source code in venv/lib/python3.13/site-packages/dorsal/api/config.py
def activate_model_by_name(name: str, scope: Literal["project", "global"] = "project") -> None:
    """Activates (enables) a model in the pipeline by name."""
    PipelineConfig.set_step_status_by_name(name=name, active=True, scope=scope)

add_files_to_collection

add_files_to_collection(
    collection_id, hashes, api_key=None
)

Adds a list of files to a remote collection by their hash.

This function automatically handles batching for large lists of hashes to comply with the API limit (10,000 per request).
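
A minimal usage sketch (the collection ID and hashes below are placeholders):

from dorsal.api import add_files_to_collection

# Add files, identified by their SHA-256 hashes, to an existing collection.
# Batching beyond the 10,000-hash API limit is handled automatically.
response = add_files_to_collection(
    collection_id="col_abc123",
    hashes=["<sha-256 hash 1>", "<sha-256 hash 2>"],
)
print(f"Added: {response.added_count}, duplicates: {response.duplicate_count}, invalid: {response.invalid_count}")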

Parameters:

collection_id (str, required): The unique ID of the collection to modify.
hashes (list[str], required): A list of SHA-256 file hashes to add.
api_key (str, optional): An API key for this request. Defaults to None.

Returns:

AddFilesResponse: A consolidated response summarizing the results of all batches.

Raises:

DorsalClientError: For API errors.
DorsalError: For other unexpected library errors.

Source code in venv/lib/python3.13/site-packages/dorsal/api/collection.py
def add_files_to_collection(
    collection_id: str,
    hashes: list[str],
    api_key: str | None = None,
) -> "AddFilesResponse":
    """
    Adds a list of files to a remote collection by their hash.

    This function automatically handles batching for large lists of hashes
    to comply with the API limit (10,000 per request).

    Args:
        collection_id (str): The unique ID of the collection to modify.
        hashes (list[str]): A list of SHA-256 file hashes to add.
        api_key (str, optional): An API key for this request.

    Returns:
        AddFilesResponse: A consolidated response summarizing the results of all batches.

    Raises:
        DorsalClientError: For API errors.
        DorsalError: For other unexpected library errors.
    """
    from dorsal.session import get_shared_dorsal_client
    from dorsal.client import DorsalClient
    from dorsal.client.validators import AddFilesResponse

    if not hashes:
        raise ValueError("The 'hashes' list cannot be empty.")

    effective_client: DorsalClient
    log_message_context = "using shared client"
    if api_key:
        log_message_context = "using temporary client with provided API key"
        effective_client = DorsalClient(api_key=api_key)
    else:
        effective_client = get_shared_dorsal_client()

    logger.debug(
        "Dispatching add_files_to_collection to client (%s) for ID: '%s' with %d hashes.",
        log_message_context,
        collection_id,
        len(hashes),
    )

    total_added = 0
    total_duplicates = 0
    total_invalid = 0
    batches = [hashes[i : i + API_MAX_BATCH_SIZE] for i in range(0, len(hashes), API_MAX_BATCH_SIZE)]

    try:
        for i, batch in enumerate(batches):
            logger.debug(f"Processing batch {i + 1}/{len(batches)}...")
            response = effective_client.add_files_to_collection(collection_id=collection_id, hashes=batch)
            total_added += response.added_count
            total_duplicates += response.duplicate_count
            total_invalid += response.invalid_count

        aggregate_response = AddFilesResponse(
            added_count=total_added,
            duplicate_count=total_duplicates,
            invalid_count=total_invalid,
        )
        logger.info(
            "Successfully finished adding files to collection '%s'. Added: %d, Duplicates: %d, Invalid: %s",
            collection_id,
            aggregate_response.added_count,
            aggregate_response.duplicate_count,
            aggregate_response.invalid_count,
        )
        return aggregate_response

    except DorsalError as err:
        logger.warning(
            "A client error occurred during add_files_to_collection (%s): %s",
            log_message_context,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "An unexpected error occurred during add_files_to_collection (%s).",
            log_message_context,
        )
        raise DorsalError(
            f"An unexpected error occurred while adding files to collection '{collection_id}': {err}"
        ) from err

add_tag_to_file

add_tag_to_file(
    hash_string, name, value, private, api_key=None
)

Adds a single tag to a file record on DorsalHub.
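
A minimal usage sketch (the hash string, tag name, and value are placeholders):

from dorsal.api import add_tag_to_file

# Attach a private "project" tag to an existing file record.
response = add_tag_to_file(
    hash_string="<sha-256 hash>",
    name="project",
    value="alpha",
    private=True,
)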

Parameters:

hash_string (str, required): The hash of the file record to tag.
name (str, required): The name of the tag.
value (Any, required): The value of the tag.
private (bool, required): Whether the tag itself is private (True) or public (False).
api_key (str, optional): An API key for this request. Defaults to None.

Returns:

FileTagResponse: A response object from the API.

Source code in venv/lib/python3.13/site-packages/dorsal/api/file.py
def add_tag_to_file(
    hash_string: str, name: str, value: Any, private: bool, api_key: str | None = None
) -> FileTagResponse:
    """
    Adds a single tag to a file record on DorsalHub.

    Args:
        hash_string (str): The hash of the file record to tag.
        name (str): The name of the tag.
        value (Any): The value of the tag.
        private (bool): Whether the tag itself is private (True) or public (False).
        api_key (str, optional): An API key for this request.

    Returns:
        FileTagResponse: A response object from the API.
    """
    from dorsal.session import get_shared_dorsal_client
    from dorsal.file.validators.file_record import NewFileTag

    effective_client = get_shared_dorsal_client()
    if api_key:
        from dorsal.client import DorsalClient

        effective_client = DorsalClient(api_key=api_key)

    try:
        new_tag = NewFileTag(name=name, value=value, private=private)
        tag_result = effective_client.add_tags_to_file(file_hash=hash_string, tags=[new_tag])
        return tag_result
    except (DorsalClientError, ValueError):
        raise

deactivate_model_by_name

deactivate_model_by_name(name, scope='project')

Deactivates (disables) a model in the pipeline by name.
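
A minimal usage sketch (the model name "exif" is a placeholder):

from dorsal.api import deactivate_model_by_name

# Disable a pipeline model for the current project only.
deactivate_model_by_name("exif")

# Disable it globally instead.
deactivate_model_by_name("exif", scope="global")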

Source code in venv/lib/python3.13/site-packages/dorsal/api/config.py
def deactivate_model_by_name(name: str, scope: Literal["project", "global"] = "project") -> None:
    """Deactivates (disables) a model in the pipeline by name."""
    PipelineConfig.set_step_status_by_name(name=name, active=False, scope=scope)

delete_collection

delete_collection(collection_id, api_key=None)

Deletes a file collection from DorsalHub by its ID.
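
A minimal usage sketch (the collection ID is a placeholder):

from dorsal.api import delete_collection

# Permanently delete a collection by its ID.
delete_collection("col_abc123")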

Parameters:

collection_id (str, required): The unique ID of the collection to delete.
api_key (str, optional): An API key for this request. Defaults to None.

Raises:

DorsalClientError: For API errors.
DorsalError: For other unexpected library errors.

Source code in venv/lib/python3.13/site-packages/dorsal/api/collection.py
def delete_collection(
    collection_id: str,
    api_key: str | None = None,
) -> None:
    """
    Deletes a file collection from DorsalHub by its ID.

    Args:
        collection_id (str): The unique ID of the collection to delete.
        api_key (str, optional): An API key for this request.

    Raises:
        DorsalClientError: For API errors.
        DorsalError: For other unexpected library errors.
    """
    from dorsal.session import get_shared_dorsal_client
    from dorsal.client import DorsalClient

    if not collection_id:
        raise ValueError("collection_id cannot be empty.")

    effective_client: DorsalClient
    log_message_context = "using shared client"
    if api_key:
        log_message_context = "using temporary client with provided API key"
        effective_client = DorsalClient(api_key=api_key)
    else:
        effective_client = get_shared_dorsal_client()

    logger.debug(
        "Dispatching delete_collections to client (%s) for ID: '%s'",
        log_message_context,
        collection_id,
    )

    try:
        effective_client.delete_collections(collection_ids=[collection_id])
        logger.info("Successfully dispatched deletion for collection '%s'", collection_id)
        return

    except DorsalError as err:
        logger.warning(
            "A client error occurred during delete_dorsal_collection (%s): %s",
            log_message_context,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "An unexpected error occurred during delete_dorsal_collection (%s).",
            log_message_context,
        )
        raise DorsalError(f"An unexpected error occurred while deleting collection '{collection_id}': {err}") from err

delete_private_dorsal_file_record

delete_private_dorsal_file_record(
    file_hash,
    *,
    tags="none",
    annotations="none",
    api_key=None
)

Delete the private file record for the given hash.
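
A minimal usage sketch (the hash is a placeholder):

from dorsal.api import delete_private_dorsal_file_record

# Delete the private record and any private tags attached to it,
# leaving annotations untouched (the default).
result = delete_private_dorsal_file_record(
    "<sha-256 hash>",
    tags="private",
)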

Parameters:

file_hash (str, required): The SHA-256 hash of the file record to delete.
tags (DeletionScope | None, optional): Specifies which tags to delete. Options: "all", "public", "private", "none", or None (treated as "none"). Defaults to "none".
annotations (DeletionScope | None, optional): Specifies which annotations to delete. Options: "all", "public", "private", "none", or None (treated as "none"). Defaults to "none".
api_key (str, optional): An API key to use for this request. Defaults to None.

Returns:

FileDeleteResponse: An object summarizing the result of the delete operation.

Raises:

ValueError: If the provided file_hash is not valid.
DorsalClientError: For API errors (e.g., not found, permission denied).
DorsalError: For other unexpected library errors.

Source code in venv/lib/python3.13/site-packages/dorsal/api/file.py
def delete_private_dorsal_file_record(
    file_hash: str,
    *,
    tags: DeletionScope | None = "none",
    annotations: DeletionScope | None = "none",
    api_key: str | None = None,
) -> "FileDeleteResponse":
    """
    Delete the private file record for the given hash.

    Args:
        file_hash (str): The SHA-256 hash of the file record to delete.
        tags (DeletionScope | None): Specifies which tags to delete.
            Options: "all", "public", "private", "none", or None (treated as "none").
            Defaults to "none".
        annotations (DeletionScope | None): Specifies which annotations to delete.
            Options: "all", "public", "private", "none", or None (treated as "none").
            Defaults to "none".
        api_key (str, optional): An API key to use for this request.

    Returns:
        FileDeleteResponse: An object summarizing the result of the delete operation.

    Raises:
        ValueError: If the provided `file_hash` is not valid.
        DorsalClientError: For API errors (e.g., not found, permission denied).
        DorsalError: For other unexpected library errors.
    """
    logger.info(
        "Requesting deletion of PRIVATE record for hash %s (tags=%s, annotations=%s)",
        file_hash,
        tags or "none",
        annotations or "none",
    )

    return _delete_dorsal_file_record(
        file_hash=file_hash,
        record="private",
        tags=tags if tags is not None else "none",
        annotations=annotations if annotations is not None else "none",
        api_key=api_key,
    )

delete_public_dorsal_file_record

delete_public_dorsal_file_record(
    file_hash,
    *,
    tags="none",
    annotations="none",
    api_key=None
)

Delete the public file record for the given hash.
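
A minimal usage sketch (the hash is a placeholder):

from dorsal.api import delete_public_dorsal_file_record

# Delete the public record along with all of its tags and annotations.
result = delete_public_dorsal_file_record(
    "<sha-256 hash>",
    tags="all",
    annotations="all",
)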

Parameters:

file_hash (str, required): The SHA-256 hash of the file record to delete.
tags (DeletionScope | None, optional): Specifies which tags to delete. Options: "all", "public", "private", "none", or None (treated as "none"). Defaults to "none".
annotations (DeletionScope | None, optional): Specifies which annotations to delete. Options: "all", "public", "private", "none", or None (treated as "none"). Defaults to "none".
api_key (str, optional): An API key to use for this request. Defaults to None.

Returns:

FileDeleteResponse: An object summarizing the result of the delete operation.

Raises:

ValueError: If the provided file_hash is not valid.
DorsalClientError: For API errors (e.g., not found, permission denied).
DorsalError: For other unexpected library errors.

Source code in venv/lib/python3.13/site-packages/dorsal/api/file.py
def delete_public_dorsal_file_record(
    file_hash: str,
    *,
    tags: DeletionScope | None = "none",
    annotations: DeletionScope | None = "none",
    api_key: str | None = None,
) -> "FileDeleteResponse":
    """
    Delete the public file record for the given hash.

    Args:
        file_hash (str): The SHA-256 hash of the file record to delete.
        tags (DeletionScope | None): Specifies which tags to delete.
            Options: "all", "public", "private", "none", or None (treated as "none").
            Defaults to "none".
        annotations (DeletionScope | None): Specifies which annotations to delete.
            Options: "all", "public", "private", "none", or None (treated as "none").
            Defaults to "none".
        api_key (str, optional): An API key to use for this request.

    Returns:
        FileDeleteResponse: An object summarizing the result of the delete operation.

    Raises:
        ValueError: If the provided `file_hash` is not valid.
        DorsalClientError: For API errors (e.g., not found, permission denied).
        DorsalError: For other unexpected library errors.
    """
    logger.info(
        "Requesting deletion of PUBLIC record for hash %s (tags=%s, annotations=%s)",
        file_hash,
        tags or "none",
        annotations or "none",
    )

    return _delete_dorsal_file_record(
        file_hash=file_hash,
        record="public",
        tags=tags if tags is not None else "none",
        annotations=annotations if annotations is not None else "none",
        api_key=api_key,
    )

export_collection

export_collection(
    collection_id,
    output_path,
    poll_interval=5,
    timeout=3600,
    api_key=None,
    console=None,
    palette=None,
)

Exports a remote file collection from DorsalHub.

Starts an export job, polls for its completion, and downloads to a local path.

The exported file will be in .json.gz format.
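
A minimal usage sketch (the collection ID and output path are placeholders):

from dorsal.api import export_collection

# Start an export job, poll until it finishes, and download the result.
export_collection(
    collection_id="col_abc123",
    output_path="exports/my_collection.json.gz",
)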

Parameters:

collection_id (str, required): The ID of the collection to export.
output_path (str | Path, required): The local path to save the exported file.
poll_interval (int, optional): Seconds to wait between status checks. Defaults to 5.
timeout (int | None, optional): Total seconds to wait for the job to complete. Defaults to 3600.
api_key (str, optional): An API key for this request. Defaults to None.
console (Console, optional): A rich.console.Console for progress display. Defaults to None.
palette (dict, optional): Color palette for the progress bar. Defaults to None.

Raises:

DorsalClientError: For API errors.
DorsalError: For other unexpected library errors.

Source code in venv/lib/python3.13/site-packages/dorsal/api/collection.py
def export_collection(
    collection_id: str,
    output_path: str | pathlib.Path,
    poll_interval: int = 5,
    timeout: int | None = 3600,
    api_key: str | None = None,
    console: "Console | None" = None,
    palette: dict | None = None,
) -> None:
    """
    Exports a remote file collection from DorsalHub.

    Starts an export job, polls for its completion, and downloads to a local path.

    The exported file will be in .json.gz format.

    Args:
        collection_id (str): The ID of the collection to export.
        output_path (str | pathlib.Path): The local path to save the exported file.
        poll_interval (int): Seconds to wait between status checks.
        timeout (int | None): Total seconds to wait for the job to complete.
        api_key (str, optional): An API key for this request.
        console (Console, optional): A rich.console.Console for progress display.
        palette (dict, optional): Color palette for the progress bar.

    Raises:
        DorsalClientError: For API errors.
        DorsalError: For other unexpected library errors.
    """
    from dorsal.session import get_shared_dorsal_client
    from dorsal.client import DorsalClient

    effective_client: DorsalClient
    log_message_context = "using shared client"
    if api_key:
        log_message_context = "using temporary client with provided API key"
        effective_client = DorsalClient(api_key=api_key)
    else:
        effective_client = get_shared_dorsal_client()

    logger.debug(
        "Dispatching export_collection to client (%s) for ID: '%s'",
        log_message_context,
        collection_id,
    )

    try:
        effective_client.export_collection(
            collection_id=collection_id,
            output_path=str(output_path),
            poll_interval=poll_interval,
            timeout=timeout,
            console=console,
            palette=palette,
        )
        logger.info("Successfully exported collection '%s' to '%s'", collection_id, output_path)

    except DorsalError as err:
        logger.warning(
            "A client error occurred during export_dorsal_collection (%s): %s",
            log_message_context,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "An unexpected error occurred during export_dorsal_collection (%s).",
            log_message_context,
        )
        raise DorsalError(f"An unexpected error occurred while exporting collection '{collection_id}': {err}") from err

find_duplicates

find_duplicates(
    path,
    recursive=False,
    min_size=0,
    max_size=None,
    mode="hybrid",
    use_cache=True,
    progress_console=None,
    palette=None,
)

Finds duplicate files in a directory using a multi-pass filtering strategy. Candidate files are first grouped by size; in the default "hybrid" mode they are then narrowed with a fast quick-hash pass and confirmed with a final SHA-256 pass, while the "quick" and "sha256" modes use only the corresponding hash.
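
A minimal usage sketch (the directory path is a placeholder; the result keys shown match those produced by the implementation below):

from dorsal.api import find_duplicates

# Scan a directory tree for duplicates using the default hybrid strategy.
results = find_duplicates("path/to/photos", recursive=True)

for dup_set in results.get("duplicate_sets", []):
    print(f"{dup_set['count']} identical copies found")
print(f"Hashes served from cache: {results.get('hashes_from_cache', 0)}")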

Source code in venv/lib/python3.13/site-packages/dorsal/api/file.py
def find_duplicates(
    path: str | pathlib.Path,
    recursive: bool = False,
    min_size: str | int = 0,
    max_size: str | int | None = None,
    mode: Literal["hybrid", "quick", "sha256"] = "hybrid",
    use_cache: bool = True,
    progress_console: "Console" | None = None,
    palette: dict | None = None,
) -> dict:
    """
    Finds duplicate files in a directory using a multi-pass filtering strategy.
    """

    path = pathlib.Path(path)
    min_size_bytes = parse_filesize(min_size) if isinstance(min_size, str) else min_size
    max_size_bytes = parse_filesize(max_size) if isinstance(max_size, str) else max_size

    candidate_files, total_scanned, inaccessible_count = _filter_by_size(
        path, recursive, min_size_bytes, max_size_bytes, progress_console, palette
    )
    if not candidate_files:
        logger.debug("No potential duplicates found based on file size. Finished.")
        return {}

    results = {}
    total_cache_hits = 0

    if mode == "hybrid":
        logger.debug("Pass 2/3: Identifying potential duplicates with QUICK hash...")
        quick_hash_map, quick_cache_hits = _find_duplicates_quick(candidate_files, use_cache, progress_console, palette)
        total_cache_hits += quick_cache_hits

        potential_duplicates = [pathlib.Path(p) for s in quick_hash_map.values() if len(s) > 1 for p in s]

        if not potential_duplicates:
            logger.debug("No potential duplicates found after QUICK hash pass. Finished.")
            results = _format_duplicate_results(path=str(path), hash_map=quick_hash_map)
        else:
            logger.debug(
                "Pass 2 complete. Found %d potential duplicates to verify with SHA-256.",
                len(potential_duplicates),
            )
            logger.debug("Pass 3/3: Verifying duplicates with SHA-256 hash...")
            final_hash_map, sha_cache_hits = _find_duplicates_sha256(
                potential_duplicates, use_cache, progress_console, palette
            )
            total_cache_hits += sha_cache_hits
            results = _format_duplicate_results(path=str(path), hash_map=final_hash_map)

    elif mode == "quick":
        logger.warning("Using 'quick' mode. Results may include false positives.")
        logger.debug("Pass 2/2: Identifying potential duplicates with QUICK hash...")
        quick_hash_map, total_cache_hits = _find_duplicates_quick(candidate_files, use_cache, progress_console, palette)
        results = _format_duplicate_results(path=str(path), hash_map=quick_hash_map)

    elif mode == "sha256":
        logger.debug("Pass 2/2: Identifying duplicates with SHA-256 hash...")
        sha256_hash_map, total_cache_hits = _find_duplicates_sha256(
            candidate_files, use_cache, progress_console, palette
        )
        results = _format_duplicate_results(path=str(path), hash_map=sha256_hash_map)

    else:
        raise ValueError(f"Invalid mode '{mode}'. Must be one of 'hybrid', 'quick', or 'sha256'.")

    results["hashes_from_cache"] = total_cache_hits

    if results and results.get("total_sets", 0) > 0:
        total_duplicate_files = sum(s["count"] for s in results.get("duplicate_sets", []))
        logger.debug(
            "Duplicate search complete. Scanned %d files (%d inaccessible, %d hashes from cache). "
            "Found %d sets of duplicates, comprising %d total files.",
            total_scanned,
            inaccessible_count,
            total_cache_hits,
            results["total_sets"],
            total_duplicate_files,
        )
    else:
        logger.debug("Duplicate search complete. No duplicate sets were found.")

    return results

generate_html_directory_report

generate_html_directory_report(
    dir_path,
    output_path=None,
    *,
    local_collection=None,
    template="default",
    use_cache=True,
    recursive=False
)

Generates a self-contained HTML dashboard for a directory of files.

This function orchestrates the creation of a rich, interactive HTML document. It processes a directory into a LocalFileCollection, generates data for various UI panels based on user configuration, and renders the result using a flexible Jinja2 template system.
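
A minimal usage sketch (paths are placeholders):

from dorsal.api import generate_html_directory_report

# Render a dashboard for a directory tree and save it to disk.
generate_html_directory_report(
    "path/to/data",
    output_path="directory_report.html",
    recursive=True,
)

# Or keep the HTML in memory (the string is returned when no output_path is given).
html_content = generate_html_directory_report("path/to/data")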

Parameters:

dir_path (str, required): The path to the local directory to report on.
output_path (str, optional): If provided, the HTML dashboard will be saved to this file path. Defaults to None.
local_collection (LocalFileCollection, optional): An existing, pre-processed collection can be passed to avoid re-scanning the directory. Defaults to None.
template (str, optional): The name of the template to use. Defaults to "default".
use_cache (bool, optional): Whether to use the local cache during file processing. Defaults to True.
recursive (bool, optional): Whether to scan the directory recursively. Defaults to False.

Returns:

str | None: The generated HTML dashboard as a string, or None if output_path was provided and the dashboard was written to disk.

Raises:

DorsalError: If file processing or report generation fails.
TemplateNotFoundError: If the specified template cannot be located.
FileNotFoundError: If the specified dir_path does not exist.

Source code in venv/lib/python3.13/site-packages/dorsal/api/file.py
def generate_html_directory_report(
    dir_path: str,
    output_path: str | None = None,
    *,
    local_collection: LocalFileCollection | None = None,
    template: str = "default",
    use_cache: bool = True,
    recursive: bool = False,
) -> str | None:
    """
    Generates a self-contained HTML dashboard for a directory of files.

    This function orchestrates the creation of a rich, interactive HTML document.
    It processes a directory into a LocalFileCollection, generates data for various
    UI panels based on user configuration, and renders the result using a
    flexible Jinja2 template system.

    Args:
        dir_path (str): The path to the local directory to report on.
        local_collection (LocalFileCollection, optional): An existing, pre-processed
            collection can be passed to avoid re-scanning the directory.
        output_path (str, optional): If provided, the HTML dashboard will be saved
            to this file path.
        template (str, optional): The name of the template to use. Defaults to "default".
        use_cache (bool, optional): Whether to use the local cache during file processing.
        recursive (bool, optional): Whether to scan the directory recursively.

    Returns:
        str | None: The generated HTML dashboard as a string, or None if it was written to output_path.

    Raises:
        DorsalError: If file processing or report generation fails.
        TemplateNotFoundError: If the specified template cannot be located.
        FileNotFoundError: If the specified `dir_path` does not exist.
    """
    from jinja2 import Environment, FileSystemLoader
    from dorsal.common.config import get_collection_report_panel_config
    from dorsal.file.collection.local import LocalFileCollection
    from dorsal.file.utils.reports import REPORT_DATA_GENERATORS, resolve_template_path
    from dorsal.templates.file.icons import get_media_type_icon
    from dorsal.version import __version__
    import datetime
    import html
    import json
    import pathlib

    logger.debug(f"Generating HTML dashboard for: '{dir_path}' using template: '{template}'")
    try:
        if local_collection is None:
            collection = LocalFileCollection(
                source=dir_path,
                recursive=recursive,
                use_cache=use_cache,
            )
        else:
            collection = local_collection

        panel_config = get_collection_report_panel_config()
        enabled_panels = [name for name, is_enabled in panel_config.items() if is_enabled]

        panels_to_render = []
        for panel_id in enabled_panels:
            generator_func = REPORT_DATA_GENERATORS.get(panel_id)
            if generator_func:
                logger.debug(f"Generating data for panel: {panel_id}")
                panel_data = generator_func(collection)
                panels_to_render.append(
                    {
                        "id": panel_id,
                        "title": panel_id.replace("_", " ").title(),
                        "data": panel_data,
                    }
                )
            else:
                logger.warning(f"No data generator found for configured panel: {panel_id}")

        template_file, template_base_dir = resolve_template_path(report_type="collection", name_or_path=template)

        env = Environment(loader=FileSystemLoader(template_base_dir), autoescape=True)
        env.globals["human_filesize"] = human_filesize
        env.globals["get_media_type_icon"] = get_media_type_icon

        jinja_template = env.get_template(template_file.name)

        collection_dict = collection.to_dict()
        collection_dict["panels"] = panels_to_render

        full_collection_data_json = json.dumps(collection_dict, default=str)

        context = {
            "report_title": f"Directory Report: {html.escape(pathlib.Path(dir_path).name)}",
            "collection_source_path": dir_path,
            "generation_date": datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC"),
            "dorsal_version": __version__,
            "panels": panels_to_render,
            "full_collection_data_json": full_collection_data_json,
        }

        html_content = jinja_template.render(context)

        if output_path:
            output_file = pathlib.Path(output_path)
            output_file.parent.mkdir(parents=True, exist_ok=True)
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(html_content)
            logger.info(f"HTML dashboard saved to: {output_path}")
            return None

        return html_content
    except Exception as e:
        logger.exception(f"Failed to generate HTML dashboard for '{dir_path}'.")
        if isinstance(e, (DorsalError, FileNotFoundError)):
            raise
        raise DorsalError(f"Could not generate HTML dashboard for {dir_path}: {e}") from e

generate_html_file_report

generate_html_file_report(
    file_path,
    *,
    local_file=None,
    output_path=None,
    template="default",
    use_cache=True,
    api_key=None
)

Generates a self-contained HTML report for a single local file.

This function serves as a high-level entry point to the reporting engine. It leverages scan_file to perform a full metadata extraction and then renders the result into a rich, interactive HTML document using a flexible, user-configurable Jinja2 template system. The final output is a single, portable HTML file with all CSS and JavaScript embedded.

Example
from dorsal.api import generate_html_file_report

# Generate the report and save it to a file
generate_html_file_report(
    "path/to/my_document.pdf",
    output_path="report.html"
)

# Generate a report using a custom template and get the HTML as a string
html_content = generate_html_file_report(
    "path/to/archive.zip",
    template="compact"
)

Parameters:

file_path (str, required): The path to the local file to report on.
local_file (LocalFile, optional): An existing, pre-processed LocalFile can be passed to avoid re-scanning the file. Defaults to None.
output_path (str, optional): If provided, the HTML report will be saved to this file path. Defaults to None.
template (str, optional): The name of a built-in/user-defined template or an absolute path to a custom template .html file. Defaults to "default".
use_cache (bool, optional): Whether to use the local cache during file processing. Defaults to True.
api_key (str, optional): An API key for operations that may require it. Defaults to None.

Returns:

str | None: The generated HTML report as a string, or None if output_path was provided and the report was written to disk.

Raises:

DorsalError: If file processing or report generation fails.
TemplateNotFoundError: If the specified template cannot be located.
FileNotFoundError: If the specified file_path does not exist.

Source code in venv/lib/python3.13/site-packages/dorsal/api/file.py
def generate_html_file_report(
    file_path: str,
    *,
    local_file: LocalFile | None = None,
    output_path: str | None = None,
    template: str = "default",
    use_cache: bool = True,
    api_key: str | None = None,
) -> str | None:
    """
    Generates a self-contained HTML report for a single local file.

    This function serves as a high-level entry point to the reporting engine. It
    leverages `scan_file` to perform a full metadata extraction and then renders
    the result into a rich, interactive HTML document using a flexible,
    user-configurable Jinja2 template system. The final output is a single,
    portable HTML file with all CSS and JavaScript embedded.

    Example:
        ```python
        from dorsal.api import generate_html_file_report

        # Generate the report and save it to a file
        generate_html_file_report(
            "path/to/my_document.pdf",
            output_path="report.html"
        )

        # Generate a report using a custom template and get the HTML as a string
        html_content = generate_html_file_report(
            "path/to/archive.zip",
            template="compact"
        )
        ```

    Args:
        file_path (str): The path to the local file to report on.
        output_path (str, optional): If provided, the HTML report will be saved
            to this file path. Defaults to None.
        template (str, optional): The name of a built-in/user-defined template
            or an absolute path to a custom template .html file.
            Defaults to "default".
        use_cache (bool, optional): Whether to use the local cache during file
            processing. Defaults to True.
        api_key (str, optional): An API key for operations that may require it.
            Defaults to None.

    Returns:
        str | None: The generated HTML report as a string, or None if it was written to output_path.

    Raises:
        DorsalError: If file processing or report generation fails.
        TemplateNotFoundError: If the specified template cannot be located.
        FileNotFoundError: If the specified `file_path` does not exist.
    """
    from jinja2 import Environment, FileSystemLoader
    from dorsal.templates.file.icons import get_media_type_icon
    from dorsal.version import __version__

    logger.debug(f"Generating HTML report for: '{file_path}' using template: '{template}'")
    try:
        if local_file is None:
            local_file = scan_file(file_path, use_cache=use_cache, api_key=api_key)

        template_file, template_base_dir = resolve_template_path(report_type="file", name_or_path=template)

        env = Environment(loader=FileSystemLoader(template_base_dir), autoescape=True)
        env.globals["human_filesize"] = human_filesize
        env.globals["get_media_type_icon"] = get_media_type_icon

        jinja_template = env.get_template(template_file.name)
        file_dict = local_file.to_dict(mode="json")

        base_info = file_dict.get("annotations", {}).get("file/base", {}).get("record", {})
        file_size_info = {
            "human": human_filesize(base_info.get("size", 0)),
            "raw": f"{base_info.get('size', 0)} bytes",
        }

        local_fs_info = {
            "full_path": local_file._file_path,
            "date_created": {
                "human": local_file.date_created.strftime("%Y-%m-%d %H:%M:%S"),
                "raw": local_file.date_created.isoformat(),
            },
            "date_modified": {
                "human": local_file.date_modified.strftime("%Y-%m-%d %H:%M:%S"),
                "raw": local_file.date_modified.isoformat(),
            },
        }

        context = {
            "report_title": f"Dorsal Report: {html.escape(base_info.get('name', 'Untitled File'))}",
            "generation_date": datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC"),
            "file": file_dict,
            "file_size": file_size_info,
            "raw_data_json": json.dumps(file_dict, indent=2, default=str),
            "local_filesystem_info": local_fs_info,
            "dorsal_version": __version__,
        }

        html_content = jinja_template.render(context)

        if output_path:
            output_file = pathlib.Path(output_path)
            output_file.parent.mkdir(parents=True, exist_ok=True)
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(html_content)
            logger.info(f"HTML file report saved to: {output_path}")
            return None

        return html_content
    except Exception as e:
        logger.exception(f"Failed to generate HTML file report for '{file_path}'.")
        if isinstance(e, (DorsalError, FileNotFoundError)):
            raise
        raise DorsalError(f"Could not generate HTML report for {file_path}: {e}") from e

get_collection

get_collection(
    collection_id: str,
    hydrate: Literal[True],
    page: int = 1,
    per_page: int = 30,
    api_key: str | None = None,
    *,
    mode: Literal["pydantic"]
) -> HydratedSingleCollectionResponse
get_collection(
    collection_id: str,
    hydrate: Literal[True],
    page: int = 1,
    per_page: int = 30,
    api_key: str | None = None,
    *,
    mode: Literal["dict"]
) -> dict[str, Any]
get_collection(
    collection_id: str,
    hydrate: Literal[True],
    page: int = 1,
    per_page: int = 30,
    api_key: str | None = None,
    *,
    mode: Literal["json"]
) -> str
get_collection(
    collection_id: str,
    hydrate: Literal[False],
    page: int = 1,
    per_page: int = 30,
    api_key: str | None = None,
    *,
    mode: Literal["pydantic"]
) -> SingleCollectionResponse
get_collection(
    collection_id: str,
    hydrate: Literal[False],
    page: int = 1,
    per_page: int = 30,
    api_key: str | None = None,
    *,
    mode: Literal["dict"]
) -> dict[str, Any]
get_collection(
    collection_id: str,
    hydrate: Literal[False] = False,
    page: int = 1,
    per_page: int = 30,
    api_key: str | None = None,
    *,
    mode: Literal["json"]
) -> str
get_collection(
    collection_id,
    hydrate=False,
    page=1,
    per_page=30,
    api_key=None,
    *,
    mode="pydantic"
)

Retrieves a specific collection and its contents from DorsalHub.
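
A minimal usage sketch (the collection ID is a placeholder):

from dorsal.api import get_collection

# Fetch a collection summary as a Pydantic model (the default mode).
collection = get_collection("col_abc123")

# Fetch fully hydrated file records for the second page, as a plain dict.
details = get_collection("col_abc123", hydrate=True, page=2, mode="dict")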

Parameters:

collection_id (str, required): The unique ID of the collection to fetch.
hydrate (bool, optional): If True, returns fully detailed file records. Defaults to False.
page (int, optional): The page number for file contents. Defaults to 1.
per_page (int, optional): The number of file records per page. Defaults to 30.
api_key (str, optional): An API key for this request. Defaults to None.
mode (Literal["pydantic", "dict", "json"], optional): The desired return format. Defaults to "pydantic".

Returns:

SingleCollectionResponse | HydratedSingleCollectionResponse | dict[str, Any] | str: The collection data, formatted according to the specified mode.

Raises:

DorsalClientError: For API errors like not found or authentication issues.
DorsalError: For other unexpected library errors.

Source code in venv/lib/python3.13/site-packages/dorsal/api/collection.py
def get_collection(
    collection_id: str,
    hydrate: bool = False,
    page: int = 1,
    per_page: int = 30,
    api_key: str | None = None,
    *,
    mode: Literal["pydantic", "dict", "json"] = "pydantic",
) -> "SingleCollectionResponse | HydratedSingleCollectionResponse | dict[str, Any] | str":
    """
    Retrieves a specific collection and its contents from DorsalHub.

    Args:
        collection_id (str): The unique ID of the collection to fetch.
        hydrate (bool): If True, returns fully detailed file records. Defaults to False.
        page (int): The page number for file contents. Defaults to 1.
        per_page (int): The number of file records per page. Defaults to 30.
        api_key (str, optional): An API key for this request.
        mode (Literal["pydantic", "dict", "json"]): The desired return format.

    Returns:
        The collection data, formatted according to the specified `mode`.

    Raises:
        DorsalClientError: For API errors like not found or authentication issues.
        DorsalError: For other unexpected library errors.
    """
    from dorsal.session import get_shared_dorsal_client
    from dorsal.client import DorsalClient

    effective_client: DorsalClient
    log_message_context = "using shared client"
    if api_key:
        log_message_context = "using temporary client with provided API key"
        effective_client = DorsalClient(api_key=api_key)
    else:
        effective_client = get_shared_dorsal_client()

    logger.debug(
        "Dispatching get_collection to client (%s) for ID: '%s', hydrate=%s",
        log_message_context,
        collection_id,
        hydrate,
    )
    response: SingleCollectionResponse | HydratedSingleCollectionResponse
    try:
        if hydrate:
            response = effective_client.get_collection(
                collection_id=collection_id,
                hydrate=True,
                page=page,
                per_page=per_page,
            )
        else:
            response = effective_client.get_collection(
                collection_id=collection_id,
                hydrate=False,
                page=page,
                per_page=per_page,
            )

        if mode == "pydantic":
            return response
        if mode == "dict":
            return response.model_dump(mode="json", by_alias=True, exclude_none=True)
        if mode == "json":
            return response.model_dump_json(indent=2, by_alias=True, exclude_none=True)

        raise ValueError(f"Invalid mode: '{mode}'.")

    except DorsalError as err:
        logger.warning(
            "A client error occurred during get_dorsal_collection (%s): %s",
            log_message_context,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "An unexpected error occurred during get_dorsal_collection (%s).",
            log_message_context,
        )
        raise DorsalError(f"An unexpected error occurred while getting collection '{collection_id}': {err}") from err

get_dataset

get_dataset(dataset_id, api_key=None, client=None)

Retrieves the full definition of an existing dataset from DorsalHub.

Fetches a dataset's metadata, including its name, description, schema, and other properties.

Example
from dorsal.api import get_dataset

try:
    # Fetch a public dataset from the 'dorsal' namespace
    dataset = get_dataset("dorsal/arxiv-cs-papers")
    print(f"Dataset Name: {dataset.name}")
    print(f"Description: {dataset.description}")
except Exception as e:
    print(f"Could not retrieve dataset: {e}")

Parameters:

dataset_id (str, required): The unique identifier for the dataset, in the format "namespace/dataset-name".
api_key (str, optional): An API key to use for this request, especially for private datasets. Defaults to None.
client (DorsalClient, optional): An existing client instance to use instead of the shared client. Defaults to None.

Returns:

Dataset: A Pydantic model instance representing the full dataset definition.

Raises:

NotFoundError: If no dataset with the specified ID is found.
DorsalClientError: If the API call fails for any other reason.

Source code in venv/lib/python3.13/site-packages/dorsal/api/dataset.py
def get_dataset(dataset_id: str, api_key: str | None = None, client: DorsalClient | None = None) -> Dataset:
    """Retrieves the full definition of an existing dataset from DorsalHub.

    Fetches a dataset's metadata, including its name, description, schema,
    and other properties.

    Example:
        ```python
        from dorsal.api import get_dataset

        try:
            # Fetch a public dataset from the 'dorsal' namespace
            dataset = get_dataset("dorsal/arxiv-cs-papers")
            print(f"Dataset Name: {dataset.name}")
            print(f"Description: {dataset.description}")
        except Exception as e:
            print(f"Could not retrieve dataset: {e}")
        ```

    Args:
        dataset_id (str): The unique identifier for the dataset, in the
            format "namespace/dataset-name".
        api_key (str, optional): An API key to use for this request, especially
            for private datasets. Defaults to None.

    Returns:
        Dataset: A Pydantic model instance representing the full dataset definition.

    Raises:
        NotFoundError: If no dataset with the specified ID is found.
        DorsalClientError: If the API call fails for any other reason.
    """
    logger.debug(
        "get_dataset called with id: '%s', api_key provided: %s",
        dataset_id,
        "Yes" if api_key else "No",
    )

    if client is None:
        try:
            client = get_shared_dorsal_client(api_key=api_key)
        except Exception as err:
            logger.exception("Failed to obtain shared DorsalClient instance.")
            raise DorsalClientError("Could not initialize or retrieve the API client.") from err

    try:
        dataset_response = client.get_dataset(dataset_id=dataset_id, api_key=api_key)
        logger.debug(
            "Successfully retrieved dataset ID: '%s'",
            dataset_id,
        )
        return dataset_response
    except (
        ValueError,
        DorsalClientError,
    ) as err:
        logger.warning(
            "API call to retrieve dataset (ID: '%s') failed: %s - %s",
            dataset_id,
            type(err).__name__,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "Unexpected error during client.get_dataset (ID: '%s').",
            dataset_id,
        )
        raise DorsalClientError(
            f"An unexpected issue occurred while fetching dataset '{dataset_id}' via the client."
        ) from err

get_dataset_schema

get_dataset_schema(dataset_id, api_key=None, client=None)

Fetches the JSON schema for a given dataset.

Example
from dorsal.api import get_dataset_schema

try:
    schema = get_dataset_schema("dorsal/arxiv-cs-papers")
    print("Schema properties:")
    for prop in schema.get("properties", {}):
        print(f"- {prop}")
except Exception as e:
    print(f"Could not retrieve schema: {e}")

Parameters:

dataset_id (str, required): Identifier for a dataset (e.g., "dorsal/arxiv").
api_key (str, optional): An API key for this request. Defaults to None.
client (DorsalClient, optional): An existing client instance to use instead of the shared client. Defaults to None.

Returns:

dict[str, Any]: The JSON schema of the dataset.

Raises:

ValueError: If dataset_id is invalid (propagated from get_dataset).
DorsalClientError: Base client error or for unexpected issues (propagated from get_dataset).
AuthError: Authentication failure (propagated from get_dataset).
NotFoundError: If the dataset is not found (propagated from get_dataset).
ForbiddenError: Access to the dataset is denied (propagated from get_dataset).
RateLimitError: If the request is rate-limited by the API (propagated from get_dataset).
NetworkError: If a network issue occurs (propagated from get_dataset).
APIError: For other HTTP errors from the API (propagated from get_dataset).
ApiDataValidationError: If the API response for the dataset is malformed and cannot be parsed into a valid Dataset object (propagated from get_dataset).

Source code in venv/lib/python3.13/site-packages/dorsal/api/dataset.py
def get_dataset_schema(dataset_id: str, api_key: str | None = None, client: DorsalClient | None = None) -> dict:
    """Fetches the JSON schema for a given dataset.

    Example:
        ```python
        from dorsal.api import get_dataset_schema

        try:
            schema = get_dataset_schema("dorsal/arxiv-cs-papers")
            print("Schema properties:")
            for prop in schema.get("properties", {}):
                print(f"- {prop}")
        except Exception as e:
            print(f"Could not retrieve schema: {e}")
        ```

    Args:
        dataset_id (str): Identifier for a dataset (e.g., "dorsal/arxiv").
        api_key (str, optional): An API key for this request.

    Returns:
        dict[str, Any]: The JSON schema of the dataset.

    Raises:
        ValueError: If `dataset_id` is invalid (propagated from `get_dataset`).
        DorsalClientError: Base client error or for unexpected issues (propagated
                           from `get_dataset`).
        AuthError: Authentication failure (propagated from `get_dataset`).
        NotFoundError: If the dataset is not found (propagated from `get_dataset`).
        ForbiddenError: Access to the dataset is denied (propagated from `get_dataset`).
        RateLimitError: If the request is rate-limited by the API (propagated from
                        `get_dataset`).
        NetworkError: If a network issue occurs (propagated from `get_dataset`).
        APIError: For other HTTP errors from the API (propagated from `get_dataset`).
        ApiDataValidationError: If the API response for the dataset is malformed and
                                cannot be parsed into a valid `Dataset` object
                                (propagated from `get_dataset`).
    """
    logger.debug(
        "Attempting to fetch schema for dataset_id: '%s'. API key used: %s",
        dataset_id,
        "Yes (user-provided)" if api_key else "No (client default)",
    )
    if client is None:
        try:
            client = get_shared_dorsal_client()
        except Exception as err:
            logger.exception("Failed to obtain shared DorsalClient instance.")
            raise DorsalClientError("Could not initialize or retrieve the API client.") from err

    try:
        schema = client.get_dataset_schema(dataset_id=dataset_id)
        logger.debug("Successfully retrieved schema via client for dataset_id: '%s'", dataset_id)
        return schema
    except (ValueError, DorsalClientError) as err:
        logger.warning(
            "Client call to retrieve schema for dataset_id '%s' failed: %s - %s",
            dataset_id,
            type(err).__name__,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "Unexpected error during client.get_dataset_schema call for dataset_id: '%s'.",
            dataset_id,
        )
        raise DorsalClientError(
            message=f"An unexpected issue occurred while fetching schema for dataset '{dataset_id}' via the client."
        ) from err

get_directory_info

get_directory_info(
    dir_path,
    recursive=False,
    media_type=True,
    progress_console=None,
    palette=None,
)

Calculates and returns a detailed summary of a directory (file and subdirectory counts and, optionally, per-media-type statistics) using a single-pass walk of the tree.
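
A minimal usage sketch (the path is a placeholder; the exact fields of the result object are not documented here, so it is printed as-is):

from dorsal.api import get_directory_info

# Summarize a directory tree, including media-type statistics.
info = get_directory_info("path/to/data", recursive=True)
print(info)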

Source code in venv/lib/python3.13/site-packages/dorsal/api/file.py
def get_directory_info(
    dir_path: str,
    recursive: bool = False,
    media_type: bool = True,
    progress_console: "Console | None" = None,
    palette: dict[str, str] | None = None,
) -> _DirectoryInfoResult:
    """
    Calculates and returns a detailed summary of a directory using a single-pass method.
    """
    if not os.path.isdir(dir_path):
        raise NotADirectoryError(f"The specified path is not a directory: {dir_path}")

    metrics = _DirectoryMetrics(media_type_enabled=media_type)
    start_time = time.perf_counter()

    tqdm_bar = None
    rich_progress = None
    task_id = None

    if is_jupyter_environment():
        from tqdm import tqdm

        tqdm_bar = tqdm(desc="Analyzing directory", unit="file")
    elif progress_console:
        rich_progress = _create_rich_progress(progress_console, palette)
        task_id = rich_progress.add_task("Analyzing directory...", total=None)

    progress_manager = rich_progress if rich_progress else open(os.devnull, "w")
    with progress_manager:
        for root, dirs, files in os.walk(dir_path, topdown=True):
            metrics.total_dirs += len(dirs)

            for name in files:
                metrics.total_files += 1

                if rich_progress and task_id is not None:
                    rich_progress.update(task_id, advance=1)
                elif tqdm_bar:
                    tqdm_bar.update(1)

                metrics.process_file(pathlib.Path(root) / name)

            if not recursive:
                dirs.clear()

    if tqdm_bar:
        tqdm_bar.close()

    duration = time.perf_counter() - start_time
    logger.info("Directory analysis for '%s' completed in %.2f seconds.", dir_path, duration)

    return _format_results(metrics, duration)

get_dorsal_file_record

get_dorsal_file_record(
    hash_string: str,
    mode: Literal["pydantic"],
    private: bool | None = None,
    api_key: str | None = None,
) -> "FileRecord"
get_dorsal_file_record(
    hash_string: str,
    mode: Literal["dict"],
    private: bool | None = None,
    api_key: str | None = None,
) -> dict[str, Any]
get_dorsal_file_record(
    hash_string: str,
    mode: Literal["json"],
    private: bool | None = None,
    api_key: str | None = None,
) -> str
get_dorsal_file_record(
    hash_string, mode="pydantic", private=None, api_key=None
)

Gets metadata for a file record from DorsalHub.

  • private=None (Default): check for a private record first, and if not found, check for a public one.
  • private=True: get the private record only
  • private=False: get the public record only
Example
from dorsal.api import get_dorsal_file_record

# Agnostic search (recommended)
agnostic_file = get_dorsal_file_record("some_hash")

# Public-only search
public_file = get_dorsal_file_record("some_hash", private=False)

# Private-only search as a dictionary
private_file_dict = get_dorsal_file_record("some_hash", private=True, mode="dict")

Parameters:

hash_string (str, required): The hash of the file to fetch (e.g., "sha256:...").
mode (Literal["pydantic", "dict", "json"], optional): The desired return format. Defaults to "pydantic", returning a FileRecord object.
private (bool | None, optional): Controls the search visibility. Defaults to None (agnostic search).
api_key (str, optional): An API key to use for this request, overriding any globally configured key. Defaults to None.

Source code in venv/lib/python3.13/site-packages/dorsal/api/file.py
def get_dorsal_file_record(
    hash_string: str,
    mode: Literal["pydantic", "dict", "json"] = "pydantic",
    private: bool | None = None,
    api_key: str | None = None,
) -> "FileRecord | dict[str, Any] | str":
    """
    Gets metadata for a file record from DorsalHub

    - `private=None` (Default): check for a public record first, and if not found, check for a private one.
    - `private=True`: get public record
    - `private=False`: get private record

    Example:
        ```python
        from dorsal.api import get_file_metadata

        # Agnostic search (recommended)
        agnostic_file = get_file_metadata("some_hash")

        # Public-only search
        public_file = get_file_metadata("some_hash", private=False)

        # Private-only search as a dictionary
        private_file_dict = get_file_metadata("some_hash", private=True, mode="dict")
        ```

    Args:
        hash_string (str): The hash of the file to fetch (e.g., "sha256:...").
        mode (Literal["pydantic", "dict", "json"], optional): The desired return
            format. Defaults to "pydantic", returning a `FileRecord` object.
        private (Optional[bool], optional): Controls the search visibility.
            Defaults to None (agnostic search).
        api_key (str, optional): An API key to use for this request, overriding
            any globally configured key. Defaults to None.

    """
    from dorsal.session import get_shared_dorsal_client

    search_strategy = (
        "Agnostic (Private, then Public)" if private is None else ("Private-only" if private else "Public-only")
    )
    log_message_context = "using default client"

    effective_client = get_shared_dorsal_client()
    if api_key:
        from dorsal.client import DorsalClient

        log_message_context = "using temporary client with provided API key"
        effective_client = DorsalClient(api_key=api_key)

    logger.debug(
        "Getting file metadata for hash: '%s'. Mode: %s, Search: %s, %s",
        hash_string,
        mode,
        search_strategy,
        log_message_context,
    )

    try:
        cleaned_hash_string = hash_string.strip() if isinstance(hash_string, str) else ""
        if not cleaned_hash_string:
            raise ValueError("hash_string must be a non-empty string.")

        file_record = effective_client.download_file_record(
            hash_string=cleaned_hash_string,
            private=private,
        )

        if mode == "pydantic":
            return file_record
        if mode == "dict":
            return file_record.model_dump(mode="json", by_alias=True, exclude_none=True)
        if mode == "json":
            return file_record.model_dump_json(indent=2, by_alias=True, exclude_none=True)

        raise ValueError(f"Invalid mode: '{mode}'.")

    except (TypeError, ValueError) as err:
        logger.warning(
            "Input validation error in get_dorsal_file_record (hash: '%s', search: %s, %s): %s",
            hash_string,
            search_strategy,
            log_message_context,
            err,
        )
        raise
    except DorsalClientError as err:
        if isinstance(err.original_exception, NotFoundError):
            err.message = f"File not found in '{search_strategy}' scope for hash '{cleaned_hash_string}'."

        logger.warning(
            "DorsalClientError during get_dorsal_file_record (hash: '%s', search: %s, %s): %s",
            hash_string,
            search_strategy,
            log_message_context,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "Unexpected error in get_dorsal_file_record for hash '%s' (%s).",
            hash_string,
            log_message_context,
        )
        if isinstance(err, DorsalError):
            raise
        raise DorsalError(f"An unexpected error occurred while getting metadata for hash '{hash_string}'.") from err

identify_file

identify_file(
    file_path: str,
    quick: bool = True,
    file_size: int | None = None,
    *,
    mode: Literal["pydantic"],
    api_key: str | None = None,
    use_cache: bool = True
) -> "FileRecord"
identify_file(
    file_path: str,
    quick: bool = True,
    file_size: int | None = None,
    *,
    mode: Literal["dict"],
    api_key: str | None = None,
    use_cache: bool = True
) -> dict[str, Any]
identify_file(
    file_path: str,
    quick: bool = True,
    file_size: int | None = None,
    *,
    mode: Literal["json"],
    api_key: str | None = None,
    use_cache: bool = True
) -> str
identify_file(
    file_path,
    quick=True,
    file_size=None,
    *,
    mode="pydantic",
    api_key=None,
    use_cache=True
)

Gets metadata for a local file from DorsalHub using its content hash.

This function identifies a file by first calculating its hash locally and then querying the remote API for a matching record. It provides an efficient "quick hash" option for large files and falls back to a secure SHA-256 hash. A local cache is used to avoid re-calculating hashes on subsequent calls for the same file path.

Example
from dorsal.api import identify_file

try:
    # Identify a file, using the fast "quick hash" if possible
    record = identify_file("path/to/my_video.mp4", mode="dict")
    print(f"Successfully identified '{record['name']}'")
    print(f"Dorsal URL: {record['url']}")

except FileNotFoundError:
    print("Error: The file could not be found at that path.")
except DorsalClientError as e:
    # Catches errors like record not found on the server
    print(f"API Error: {e.message}")

Parameters:

Name Type Description Default
file_path str

The path to the local file to identify.

required
quick bool

If True, attempts to use the faster "quick hash" for files >= 32MiB. Defaults to True.

True
file_size int

An optional pre-calculated file size in bytes. If not provided, it will be calculated. Defaults to None.

None
mode Literal['pydantic', 'dict', 'json']

The desired return format. Defaults to "pydantic".

'pydantic'
api_key str

An API key for this request, overriding the client's default. Defaults to None.

None
use_cache bool

If True, uses the local hash cache to avoid re-hashing unchanged files. Defaults to True.

True

Returns:

Type Description
FileRecord | dict[str, Any] | str

Union[FileRecord, dict, str]: The file record from DorsalHub, formatted according to the specified mode.

Raises:

Type Description
FileNotFoundError

If the file does not exist at the specified path.

DorsalClientError

For API-level errors, such as NotFoundError if no record matches the hash.

DorsalError

For other unexpected library errors.

Source code in venv/lib/python3.13/site-packages/dorsal/api/file.py
def identify_file(
    file_path: str,
    quick: bool = True,
    file_size: int | None = None,
    *,
    mode: Literal["pydantic", "dict", "json"] = "pydantic",
    api_key: str | None = None,
    use_cache: bool = True,
) -> FileRecord | dict[str, Any] | str:
    """Gets metadata for a local file from DorsalHub using its content hash.

    This function identifies a file by first calculating its hash locally and then
    querying the remote API for a matching record. It provides an efficient
    "quick hash" option for large files and falls back to a secure SHA-256 hash.
    A local cache avoids re-calculating hashes on subsequent calls for the same file path.

    Example:
        ```python
        from dorsal.api import identify_file

        try:
            # Identify a file, using the fast "quick hash" if possible
            record = identify_file("path/to/my_video.mp4", mode="dict")
            print(f"Successfully identified '{record['name']}'")
            print(f"Dorsal URL: {record['url']}")

        except FileNotFoundError:
            print("Error: The file could not be found at that path.")
        except DorsalClientError as e:
            # Catches errors like record not found on the server
            print(f"API Error: {e.message}")
        ```

    Args:
        file_path (str): The path to the local file to identify.
        quick (bool, optional): If True, attempts to use the faster "quick hash"
            for files >= 32MiB. Defaults to True.
        file_size (int, optional): An optional pre-calculated file size in bytes.
            If not provided, it will be calculated. Defaults to None.
        mode (Literal["pydantic", "dict", "json"], optional): The desired return
            format. Defaults to "pydantic".
        api_key (str, optional): An API key for this request, overriding the
            client's default. Defaults to None.
        use_cache (bool, optional): If True, uses the local hash cache to avoid
            re-hashing unchanged files. Defaults to True.

    Returns:
        Union[FileRecord, dict, str]: The file record from DorsalHub, formatted
            according to the specified `mode`.

    Raises:
        FileNotFoundError: If the file does not exist at the specified path.
        DorsalClientError: For API-level errors, such as `NotFoundError` if
            no record matches the hash.
        DorsalError: For other unexpected library errors.
    """
    from dorsal.session import get_shared_dorsal_client
    from dorsal.client import DorsalClient

    log_message_context = "using default client"
    effective_client: DorsalClient = get_shared_dorsal_client()
    if api_key:
        log_message_context = "using temporary client with provided API key"
        effective_client = DorsalClient(api_key=api_key)

    logger.debug(
        "Identifying file record for path: '%s' (quick=%s, %s)",
        file_path,
        quick,
        log_message_context,
    )

    try:
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found at path: {file_path}")

        file_record = None
        secure_hash_key = ""
        cache = get_shared_cache() if use_cache else None

        if quick:
            if file_size is None:
                file_size = get_filesize(file_path=file_path)

            if file_size >= QuickHasher.min_permitted_filesize:
                quick_hash_val = None
                if cache:
                    quick_hash_val = get_cached_hash(
                        file_path=file_path,
                        cache=cache,
                        hash_callable=lambda p: get_quick_hash(p, fallback_to_sha256=False, file_size=file_size),
                        hash_function="QUICK",
                    )
                else:
                    quick_hash_val = get_quick_hash(
                        file_path=file_path,
                        fallback_to_sha256=False,
                        file_size=file_size,
                    )

                if quick_hash_val:
                    quick_hash_key = f"QUICK:{quick_hash_val}"
                    logger.debug(
                        "Attempting to identify file with Quick Hash: %s",
                        quick_hash_key,
                    )
                    try:
                        file_record = effective_client.download_file_record(hash_string=quick_hash_key)
                    except ConflictError:
                        logger.warning(
                            "Quick Hash collision for '%s'. Falling back to SHA-256.",
                            quick_hash_key,
                        )
                    except NotFoundError:
                        logger.debug("Quick Hash record not found. Falling back to SHA-256.")
            else:
                logger.debug("File size is less than 32MiB. Skipping Quick Hash.")

        if file_record is None:
            secure_hash_val = None
            if cache:
                secure_hash_val = get_cached_hash(
                    file_path=file_path,
                    cache=cache,
                    hash_callable=get_sha256_hash,
                    hash_function="SHA-256",
                )
            else:
                secure_hash_val = get_sha256_hash(file_path=file_path)

            if not secure_hash_val:
                raise DorsalError(f"Could not generate SHA-256 hash for file: {file_path}")

            secure_hash_key = f"SHA-256:{secure_hash_val}"
            logger.debug("Attempting to identify file with Secure Hash: %s", secure_hash_key)
            file_record = effective_client.download_file_record(hash_string=secure_hash_key)

        if mode == "dict":
            return file_record.model_dump(mode="json", by_alias=True, exclude_none=True)
        if mode == "json":
            return file_record.model_dump_json(indent=2, by_alias=True, exclude_none=True)
        if mode != "pydantic":
            logger.debug("Invalid mode '%s' specified. Returning default model.", mode)  # type: ignore[unreachable]
        return file_record

    except DorsalClientError as err:
        if isinstance(err.original_exception, NotFoundError):
            hash_key = secure_hash_key or "the file's hash"
            err.message = f"No file record was found on DorsalHub matching {hash_key}."
        logger.debug("A client error occurred during identify_file for '%s': %s", file_path, err)
        raise
    except (FileNotFoundError, ValueError) as err:
        logger.error(
            "An input or file system error occurred during identify_file for '%s': %s",
            file_path,
            err,
        )
        raise
    except Exception as err:
        logger.exception("An unexpected error occurred during identify_file for '%s'.", file_path)
        if isinstance(err, DorsalError):
            raise
        raise DorsalError(f"An unexpected error occurred while identifying file '{file_path}'.") from err

index_directory

index_directory(
    dir_path,
    recursive=False,
    *,
    private=True,
    api_key=None,
    use_cache=True
)

Scans a directory and indexes all files to DorsalHub.

This is a powerful, one-shot function that performs a complete workflow:

1. Scans the specified directory for files.
2. Generates rich metadata for each file locally.
3. Uploads all generated metadata records to DorsalHub in managed batches.

Example
from dorsal.api import index_directory

# Scan a directory and index all files to your private records
summary = index_directory("path/to/project_assets", recursive=True, private=True)

print("--- Indexing Complete ---")
print(f"Files processed locally: {summary['total_records_processed_locally']}")
print(f"Successfully indexed to API: {summary['total_records_accepted_by_api']}")

Parameters:

Name Type Description Default
dir_path str

The path to the directory you want to scan and index.

required
recursive bool

If True, scans all subdirectories recursively. Defaults to False.

False
private bool

If True, all file records will be created as private on DorsalHub. Defaults to True.

True
api_key str | None

An API key to use for this operation, overriding the client's default. Defaults to None.

None
use_cache bool

If True, uses the local hash cache when generating file records. Defaults to True.

True

Returns:

Name Type Description
dict dict

A summary dictionary detailing the results of the entire batch operation, including local processing and API indexing counts.
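
A short sketch of inspecting the summary; the keys match those built by the source below, and batch_processing_details holds one entry per API batch:

```python
from dorsal.api import index_directory

summary = index_directory("path/to/project_assets", recursive=True)

# Report any batches the API rejected, using the per-batch bookkeeping fields.
if summary["failed_api_batches"]:
    for detail in summary["batch_processing_details"]:
        if detail["status"] == "failure":
            print(f"Batch {detail['batch_number']} failed: {detail['error_type']}: {detail['error_message']}")
```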

Source code in venv/lib/python3.13/site-packages/dorsal/api/file.py
def index_directory(
    dir_path: str,
    recursive: bool = False,
    *,
    private: bool = True,
    api_key: str | None = None,
    use_cache: bool = True,
) -> dict:
    """Scans a directory and indexes all files to DorsalHub.

    This is a powerful, one-shot function that performs a complete workflow:
    1. Scans the specified directory for files.
    2. Generates rich metadata for each file locally.
    3. Uploads all generated metadata records to DorsalHub in managed batches.

    Example:
        ```python
        from dorsal.api import index_directory

        # Scan a directory and index all files to your private records
        summary = index_directory("path/to/project_assets", recursive=True, private=True)

        print("--- Indexing Complete ---")
        print(f"Files processed locally: {summary['total_records_processed_locally']}")
        print(f"Successfully indexed to API: {summary['total_records_accepted_by_api']}")
        ```

    Args:
        dir_path (str): The path to the directory you want to scan and index.
        recursive (bool, optional): If True, scans all subdirectories
            recursively. Defaults to False.
        private (bool, optional): If True, all file records will be created
            as private on DorsalHub. Defaults to True.
        api_key (str | None, optional): An API key to use for this operation,
            overriding the client's default. Defaults to None.
        use_cache (bool, optional): If True, uses the local hash cache when
            generating file records. Defaults to True.

    Returns:
        dict: A summary dictionary detailing the results of the entire batch
            operation, including local processing and API indexing counts.
    """
    from dorsal.file.metadata_reader import MetadataReader

    effective_reader: MetadataReader
    log_message_context = ""

    if api_key is not None:
        log_message_context = "using provided API key with temporary MetadataReader"
        logger.debug(
            "API key override for index_directory (dir: '%s'). Creating temporary MetadataReader.",
            dir_path,
        )
        effective_reader = MetadataReader(api_key=api_key)
    else:
        log_message_context = "using shared METADATA_READER instance"
        logger.debug(
            "No API key override for index_directory (dir: '%s'). Using shared METADATA_READER.",
            dir_path,
        )
        effective_reader = get_metadata_reader()

    logger.debug(
        "High-level index_directory: dir_path='%s' (%s), recursive=%s, private=%s.",
        dir_path,
        log_message_context,
        recursive,
        private,
    )

    all_records_to_index: list[FileRecordStrict]
    file_hash_to_path_map: dict[str, str]

    try:
        logger.debug(
            "Step 1: Generating file records from directory '%s' via MetadataReader.",
            dir_path,
        )
        all_records_to_index, file_hash_to_path_map = effective_reader.generate_processed_records_from_directory(
            dir_path=dir_path, recursive=recursive, skip_cache=not use_cache
        )
        total_records_processed_locally = len(all_records_to_index)
        logger.debug(
            "MetadataReader generated %d unique file records from directory '%s'.",
            total_records_processed_locally,
            dir_path,
        )
    except DorsalError as err:
        logger.warning(
            "Failed to generate file records from directory '%s' (%s): %s - %s",
            dir_path,
            log_message_context,
            type(err).__name__,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "Unexpected error generating file records from directory '%s' (%s).",
            dir_path,
            log_message_context,
        )
        if isinstance(err, DorsalError):
            raise
        raise DorsalError(f"Unexpected error processing directory '{dir_path}': {err}") from err

    if not all_records_to_index:
        logger.debug("No unique file records generated from directory '%s' to index.", dir_path)
        return {
            "total_records_processed_locally": 0,
            "total_batches_created": 0,
            "successful_api_batches": 0,
            "failed_api_batches": 0,
            "total_records_accepted_by_api": 0,
            "batch_processing_details": [],
        }

    batches = [
        all_records_to_index[i : i + constants.API_MAX_BATCH_SIZE]
        for i in range(0, total_records_processed_locally, constants.API_MAX_BATCH_SIZE)
    ]
    total_batches_created = len(batches)

    logger.debug(
        "Submitting %d records in %d batches of up to %d each to DorsalHub (private=%s, context: %s).",
        total_records_processed_locally,
        total_batches_created,
        constants.API_MAX_BATCH_SIZE,
        private,
        log_message_context,
    )

    successful_api_batches_count = 0
    failed_api_batches_count = 0
    total_records_accepted_by_api = 0
    batch_processing_details_list = []

    for i, current_batch_records in enumerate(batches):
        batch_number = i + 1
        records_in_this_batch = len(current_batch_records)
        batch_detail_entry: dict = {
            "batch_number": batch_number,
            "records_in_batch": records_in_this_batch,
            "status": "failure",
            "api_response": None,
            "error_message": None,
            "error_type": None,
        }

        logger.debug(
            "Submitting API batch %d of %d (%d records) for directory '%s'.",
            batch_number,
            total_batches_created,
            records_in_this_batch,
            dir_path,
        )
        try:
            batch_api_response: FileIndexResponse
            if private:
                batch_api_response = effective_reader._client.index_private_file_records(
                    file_records=current_batch_records
                )
            else:
                batch_api_response = effective_reader._client.index_public_file_records(
                    file_records=current_batch_records
                )

            if file_hash_to_path_map and batch_api_response.results:
                for result_item in batch_api_response.results:
                    if hasattr(result_item, "hash") and hasattr(result_item, "file_path"):
                        path = file_hash_to_path_map.get(result_item.hash)
                        if path:
                            result_item.file_path = path

            batch_detail_entry["status"] = "success"
            batch_detail_entry["api_response"] = batch_api_response
            successful_api_batches_count += 1
            total_records_accepted_by_api += batch_api_response.success

            logger.debug(
                "API Batch %d of %d for directory '%s' submitted successfully. API Response: Total=%d, Success=%d.",
                batch_number,
                total_batches_created,
                dir_path,
                batch_api_response.total,
                batch_api_response.success,
            )
        except DorsalError as err:
            logger.warning(
                "API Batch %d of %d for directory '%s' failed: %s - %s",
                batch_number,
                total_batches_created,
                dir_path,
                type(err).__name__,
                err,
            )
            batch_detail_entry["error_message"] = str(err)
            batch_detail_entry["error_type"] = type(err).__name__
            failed_api_batches_count += 1
        except Exception as err:
            logger.exception(
                "Unexpected error submitting API batch %d of %d for directory '%s'.",
                batch_number,
                total_batches_created,
                dir_path,
            )
            batch_detail_entry["error_message"] = f"Unexpected error: {str(err)}"
            batch_detail_entry["error_type"] = type(err).__name__
            failed_api_batches_count += 1

        batch_processing_details_list.append(batch_detail_entry)

    overall_summary = {
        "total_records_processed_locally": total_records_processed_locally,
        "total_batches_created": total_batches_created,
        "successful_api_batches": successful_api_batches_count,
        "failed_api_batches": failed_api_batches_count,
        "total_records_accepted_by_api": total_records_accepted_by_api,
        "batch_processing_details": batch_processing_details_list,
    }

    if failed_api_batches_count > 0:
        logger.warning(
            "Batch indexing for directory '%s' (%s) completed with %d successful and %d failed API batches (out of %d). "
            "Total records accepted by API in successful batches: %d.",
            dir_path,
            log_message_context,
            successful_api_batches_count,
            failed_api_batches_count,
            total_batches_created,
            total_records_accepted_by_api,
        )
    else:
        logger.debug(
            "Batch indexing for directory '%s' (%s) completed successfully. All %d API batches processed. "
            "Total records accepted by API: %d.",
            dir_path,
            log_message_context,
            total_batches_created,
            total_records_accepted_by_api,
        )
    return overall_summary

index_file

index_file(
    file_path, *, private=True, api_key=None, use_cache=True
)

Processes a single local file and uploads its metadata to DorsalHub.

This function provides a simple, one-shot way to get a local file's metadata indexed on the remote server.

Example
from dorsal.api import index_file

try:
    response = index_file("path/to/my_image.jpg", private=True)
    if response.success > 0:
        print("File indexed successfully!")
        print(f"View at: {response.results[0].url}")
except Exception as e:
    print(f"Failed to index file: {e}")

Parameters:

Name Type Description Default
file_path str

The path to the local file to process and index.

required
private bool

If True, the record will be created as private. Defaults to True.

True
api_key str

An API key to use for this specific request. Defaults to None.

None
use_cache bool

If True, uses the local hash cache when processing the file. Defaults to True.

True

Returns:

Name Type Description
FileIndexResponse FileIndexResponse

A response object from the API detailing the result of the indexing operation.

Source code in venv/lib/python3.13/site-packages/dorsal/api/file.py
def index_file(
    file_path: str,
    *,
    private: bool = True,
    api_key: str | None = None,
    use_cache: bool = True,
) -> FileIndexResponse:
    """Processes a single local file and uploads its metadata to DorsalHub.

    This function provides a simple, one-shot way to get a local file's
    metadata indexed on the remote server.

    Example:
        ```python
        from dorsal.api import index_file

        try:
            response = index_file("path/to/my_image.jpg", private=True)
            if response.success > 0:
                print("File indexed successfully!")
                print(f"View at: {response.results[0].url}")
        except Exception as e:
            print(f"Failed to index file: {e}")
        ```

    Args:
        file_path (str): The path to the local file to process and index.
        private (bool, optional): If True, the record will be created as private.
            Defaults to True.
        api_key (str, optional): An API key to use for this specific request.
            Defaults to None.
        use_cache (bool, optional): If True, uses the local hash cache when
            processing the file. Defaults to True.

    Returns:
        FileIndexResponse: A response object from the API detailing the
            result of the indexing operation.
    """
    from dorsal.file.metadata_reader import MetadataReader

    metadata_reader: MetadataReader
    log_message_context = ""

    if api_key is not None:
        log_message_context = (
            "using provided API key with temporary MetadataReader (default model/ignore_duplicates settings)"
        )
        logger.debug(
            "API key override provided for index_file (file: '%s'). Creating temporary MetadataReader instance.",
            file_path,
        )
        metadata_reader = MetadataReader(api_key=api_key)

    else:
        log_message_context = "using shared METADATA_READER instance"
        logger.debug(
            "No API key override for index_file (file: '%s'). Using shared METADATA_READER instance.",
            file_path,
        )
        metadata_reader = get_metadata_reader()

    logger.debug(
        "High-level index_file calling effective MetadataReader for file_path='%s' (%s), private=%s.",
        file_path,
        log_message_context,
        private,
    )

    try:
        response = metadata_reader.index_file(file_path=file_path, private=private, skip_cache=not use_cache)
        logger.debug(
            "Effective MetadataReader.index_file completed for file_path='%s'. Response success: %s",
            file_path,
            response.success if hasattr(response, "success") else "N/A",
        )
        return response
    except (FileNotFoundError, IOError, DorsalError) as err:
        logger.warning(
            "Call to effective MetadataReader.index_file for file_path='%s' (%s) failed: %s - %s",
            file_path,
            log_message_context,
            type(err).__name__,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "Unexpected error in high-level index_file for file_path='%s' (%s).",
            file_path,
            log_message_context,
        )
        if isinstance(err, DorsalError):
            raise
        raise DorsalError(
            f"An unexpected error occurred while indexing file '{file_path}' ({log_message_context})."
        ) from err

list_collections

list_collections(
    page: int = 1,
    per_page: int = 25,
    api_key: str | None = None,
    *,
    mode: Literal["pydantic"]
) -> CollectionsResponse
list_collections(
    page: int = 1,
    per_page: int = 25,
    api_key: str | None = None,
    *,
    mode: Literal["dict"]
) -> dict[str, Any]
list_collections(
    page: int = 1,
    per_page: int = 25,
    api_key: str | None = None,
    *,
    mode: Literal["json"]
) -> str
list_collections(
    page=1, per_page=25, api_key=None, *, mode="pydantic"
)

Retrieves a paginated list of collections from DorsalHub.

This is a high-level wrapper around the DorsalClient's list_collections method, providing a simple way to access collection information.

Example
from dorsal.api import list_collections

# Get the first page of collections as Pydantic objects
response = list_collections()
print(f"Found {response.pagination.record_count} total collections.")
for collection in response.results:
    print(f"- {collection.name} (ID: {collection.id})")

# Get the second page as a JSON string
response_json = list_collections(page=2, mode="json")
print(response_json)
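
To fetch every collection, a simple page sweep works. This sketch assumes the paginated response exposes pagination.page_count (referenced in the source's debug logging below) and a results list as in the example above:

```python
from dorsal.api import list_collections

all_collections = []
page = 1
while True:
    resp = list_collections(page=page)
    all_collections.extend(resp.results)
    if page >= resp.pagination.page_count:
        break
    page += 1
print(f"Fetched {len(all_collections)} collections.")
```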

Parameters:

Name Type Description Default
page int

The page number for pagination. Defaults to 1.

1
per_page int

The number of collections per page. Defaults to 25.

25
api_key str

An API key for this request, overriding the client's default. Defaults to None.

None
mode Literal['pydantic', 'dict', 'json']

The desired return format. Defaults to "pydantic".

'pydantic'

Returns:

Type Description
CollectionsResponse | dict[str, Any] | str

Union[CollectionsResponse, dict, str]: The search results, formatted according to the specified mode.

Raises:

Type Description
DorsalClientError

For client-side validation errors or API errors like authentication or rate limiting.

DorsalError

For other unexpected library errors.

Source code in venv/lib/python3.13/site-packages/dorsal/api/collection.py
def list_collections(
    page: int = 1,
    per_page: int = 25,
    api_key: str | None = None,
    *,
    mode: Literal["pydantic", "dict", "json"] = "pydantic",
) -> "CollectionsResponse | dict[str, Any] | str":
    """
    Retrieves a paginated list of collections from DorsalHub.

    This is a high-level wrapper around the DorsalClient's list_collections
    method, providing a simple way to access collection information.

    Example:
        ```python
        from dorsal.api import list_collections

        # Get the first page of collections as Pydantic objects
        response = list_collections()
        print(f"Found {response.pagination.record_count} total collections.")
        for collection in response.results:
            print(f"- {collection.name} (ID: {collection.id})")

        # Get the second page as a JSON string
        response_json = list_collections(page=2, mode="json")
        print(response_json)
        ```

    Args:
        page (int): The page number for pagination. Defaults to 1.
        per_page (int): The number of collections per page. Defaults to 25.
        api_key (str, optional): An API key for this request, overriding the
            client's default. Defaults to None.
        mode (Literal["pydantic", "dict", "json"]): The desired return format.
            Defaults to "pydantic".

    Returns:
        Union[CollectionsResponse, dict, str]: The search results, formatted
            according to the specified `mode`.

    Raises:
        DorsalClientError: For client-side validation errors or API errors
            like authentication or rate limiting.
        DorsalError: For other unexpected library errors.
    """
    from dorsal.session import get_shared_dorsal_client
    from dorsal.client import DorsalClient

    effective_client: DorsalClient
    log_message_context = ""
    if api_key:
        log_message_context = "using temporary client with provided API key"
        logger.debug("API key override provided for list_collections. Creating temporary DorsalClient.")
        effective_client = DorsalClient(api_key=api_key)
    else:
        log_message_context = "using shared client"
        logger.debug("No API key override for list_collections. Using shared client instance.")
        effective_client = get_shared_dorsal_client()

    logger.debug(
        "Dispatching list_collections to client (%s) with params: page=%d, per_page=%d",
        log_message_context,
        page,
        per_page,
    )

    try:
        response = effective_client.list_collections(
            page=page,
            per_page=per_page,
        )

        logger.debug(
            "Collection list successful. Returned page %d of %d, with %d records.",
            response.pagination.current_page,
            response.pagination.page_count,
            len(response.records),
        )

        if mode == "pydantic":
            return response
        if mode == "dict":
            return response.model_dump(mode="json", by_alias=True, exclude_none=True)
        if mode == "json":
            return response.model_dump_json(indent=2, by_alias=True, exclude_none=True)

        raise ValueError(f"Invalid mode: '{mode}'. Must be one of 'pydantic', 'dict', or 'json'.")

    except DorsalError as err:
        logger.warning(
            "A client error occurred during list_dorsal_collections (%s): %s - %s",
            log_message_context,
            type(err).__name__,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "An unexpected error occurred during list_dorsal_collections (%s).",
            log_message_context,
        )
        raise DorsalError(f"An unexpected error occurred while listing collections: {err}") from err

make_collection_private

make_collection_private(collection_id, api_key=None)

Makes a public collection private.

This is a high-level wrapper that handles client instantiation and error logging.
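
A minimal sketch (the collection ID is a placeholder; the response is the CollectionWebLocationResponse described below):

```python
from dorsal.api import make_collection_private

response = make_collection_private(collection_id="col_123")  # placeholder ID
print(response)  # carries the collection's new private web URL
```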

Parameters:

Name Type Description Default
collection_id str

The unique ID of the collection to make private.

required
api_key str

An API key for this request.

None

Returns:

Name Type Description
CollectionWebLocationResponse CollectionWebLocationResponse

An object containing the new private web URL.

Raises:

Type Description
DorsalClientError

For API errors (e.g., ConflictError if already private).

DorsalError

For other unexpected library errors.

Source code in venv/lib/python3.13/site-packages/dorsal/api/collection.py
def make_collection_private(
    collection_id: str,
    api_key: str | None = None,
) -> "CollectionWebLocationResponse":
    """
    Makes a public collection private.

    This is a high-level wrapper that handles client instantiation and error logging.

    Args:
        collection_id (str): The unique ID of the collection to make private.
        api_key (str, optional): An API key for this request.

    Returns:
        CollectionWebLocationResponse: An object containing the new private web URL.

    Raises:
        DorsalClientError: For API errors (e.g., ConflictError if already private).
        DorsalError: For other unexpected library errors.
    """
    from dorsal.session import get_shared_dorsal_client
    from dorsal.client import DorsalClient

    effective_client: DorsalClient
    log_message_context = "using shared client"
    if api_key:
        log_message_context = "using temporary client with provided API key"
        effective_client = DorsalClient(api_key=api_key)
    else:
        effective_client = get_shared_dorsal_client()

    logger.debug(
        "Dispatching make_collection_private to client (%s) for ID: '%s'",
        log_message_context,
        collection_id,
    )

    try:
        response = effective_client.make_collection_private(collection_id=collection_id)
        logger.info("Successfully made collection '%s' private.", collection_id)
        return response

    except DorsalError as err:
        logger.warning(
            "A client error occurred during make_collection_private (%s): %s",
            log_message_context,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "An unexpected error occurred during make_collection_private (%s).",
            log_message_context,
        )
        raise DorsalError(
            f"An unexpected error occurred while making collection '{collection_id}' private: {err}"
        ) from err

make_collection_public

make_collection_public(collection_id, api_key=None)

Makes a private collection public.

This is a high-level wrapper that handles client instantiation and error logging.
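
A minimal sketch, mirroring make_collection_private above (placeholder ID; per the Raises section, a ConflictError surfaces as a DorsalClientError if the collection is already public):

```python
from dorsal.api import make_collection_public

response = make_collection_public(collection_id="col_123")  # placeholder ID
print(response)  # carries the collection's new public web URL
```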

Parameters:

Name Type Description Default
collection_id str

The unique ID of the collection to make public.

required
api_key str

An API key for this request.

None

Returns:

Name Type Description
CollectionWebLocationResponse CollectionWebLocationResponse

An object containing the new public web URL.

Raises:

Type Description
DorsalClientError

For API errors (e.g., ConflictError if already public).

DorsalError

For other unexpected library errors.

Source code in venv/lib/python3.13/site-packages/dorsal/api/collection.py
def make_collection_public(
    collection_id: str,
    api_key: str | None = None,
) -> "CollectionWebLocationResponse":
    """
    Makes a private collection public.

    This is a high-level wrapper that handles client instantiation and error logging.

    Args:
        collection_id (str): The unique ID of the collection to make public.
        api_key (str, optional): An API key for this request.

    Returns:
        CollectionWebLocationResponse: An object containing the new public web URL.

    Raises:
        DorsalClientError: For API errors (e.g., ConflictError if already public).
        DorsalError: For other unexpected library errors.
    """
    from dorsal.session import get_shared_dorsal_client
    from dorsal.client import DorsalClient

    effective_client: DorsalClient
    log_message_context = "using shared client"
    if api_key:
        log_message_context = "using temporary client with provided API key"
        effective_client = DorsalClient(api_key=api_key)
    else:
        effective_client = get_shared_dorsal_client()

    logger.debug(
        "Dispatching make_collection_public to client (%s) for ID: '%s'",
        log_message_context,
        collection_id,
    )

    try:
        response = effective_client.make_collection_public(collection_id=collection_id)
        logger.info("Successfully made collection '%s' public.", collection_id)
        return response

    except DorsalError as err:
        logger.warning(
            "A client error occurred during make_collection_public (%s): %s",
            log_message_context,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "An unexpected error occurred during make_collection_public (%s).",
            log_message_context,
        )
        raise DorsalError(
            f"An unexpected error occurred while making collection '{collection_id}' public: {err}"
        ) from err

make_schema_validator

make_schema_validator(
    dataset_id, api_key=None, client=None
)

Fetches a dataset's schema and returns a callable validator function.

This is useful for validating records client-side before attempting to insert them, which can save API calls.

Example
from dorsal.api import make_schema_validator

validator = make_schema_validator("my-org/my-book-collection")

good_record = {"author": "J.R.R. Tolkien", "title": "The Hobbit"}
bad_record = {"author": "J.R.R. Tolkien"} # Missing title

try:
    validator(good_record)
    print("Good record is valid.")
    validator(bad_record)
except Exception as e:
    print(f"Bad record is invalid: {e}")

Parameters:

Name Type Description Default
dataset_id str

Identifier for the dataset whose schema will be used.

required
api_key str

An API key for this request.

None

Returns:

Name Type Description
JsonSchemaValidator JsonSchemaValidator

A callable instance that validates a dictionary record against the fetched schema.

Raises:

Type Description
ValueError

If dataset_id is invalid (propagated from get_dataset).

DorsalClientError

Base client error or for unexpected issues (propagated from get_dataset).

AuthError

Authentication failure (propagated from get_dataset).

NotFoundError

If the dataset is not found (propagated from get_dataset).

ForbiddenError

Access to the dataset is denied (propagated from get_dataset).

RateLimitError

If the request is rate-limited by the API (propagated from get_dataset).

NetworkError

If a network issue occurs (propagated from get_dataset).

APIError

For other HTTP errors from the API (propagated from get_dataset).

ApiDataValidationError

If the API response for the dataset is malformed and cannot be parsed into a valid Dataset object (propagated from get_dataset).

JSONSchemaSchemaError

If the schema is invalid.

Source code in venv/lib/python3.13/site-packages/dorsal/api/dataset.py
def make_schema_validator(
    dataset_id: str, api_key: str | None = None, client: DorsalClient | None = None
) -> JsonSchemaValidator:
    """Fetches a dataset's schema and returns a callable validator function.

    This is useful for validating records client-side before attempting to
    insert them, which can save API calls.

    Example:
        ```python
        from dorsal.api import make_schema_validator

        validator = make_schema_validator("my-org/my-book-collection")

        good_record = {"author": "J.R.R. Tolkien", "title": "The Hobbit"}
        bad_record = {"author": "J.R.R. Tolkien"} # Missing title

        try:
            validator(good_record)
            print("Good record is valid.")
            validator(bad_record)
        except Exception as e:
            print(f"Bad record is invalid: {e}")

        ```

    Args:
        dataset_id (str): Identifier for the dataset whose schema will be used.
        api_key (str, optional): An API key for this request.

    Returns:
        JsonSchemaValidator: A callable instance that validates a dictionary
            record against the fetched schema.

    Raises:
        ValueError: If `dataset_id` is invalid (propagated from `get_dataset`).
        DorsalClientError: Base client error or for unexpected issues (propagated
                           from `get_dataset`).
        AuthError: Authentication failure (propagated from `get_dataset`).
        NotFoundError: If the dataset is not found (propagated from `get_dataset`).
        ForbiddenError: Access to the dataset is denied (propagated from `get_dataset`).
        RateLimitError: If the request is rate-limited by the API (propagated from
                        `get_dataset`).
        NetworkError: If a network issue occurs (propagated from `get_dataset`).
        APIError: For other HTTP errors from the API (propagated from `get_dataset`).
        ApiDataValidationError: If the API response for the dataset is malformed and
                                cannot be parsed into a valid `Dataset` object
                                (propagated from `get_dataset`).
        JSONSchemaSchemaError: If the schema is invalid.

    """
    if dataset_id.startswith("open/"):
        schema_name = dataset_id.removeprefix("open/")
        try:
            return get_open_schema_validator(cast(Any, schema_name))
        except (ValueError, TypeError):
            pass

    if is_offline_mode():
        raise DorsalOfflineError(
            f"Cannot fetch validator for '{dataset_id}': System is in OFFLINE mode and this schema is locally available."
        )
    schema = get_dataset_schema(dataset_id=dataset_id, api_key=api_key, client=client)

    return get_json_schema_validator(schema=schema)

register_model

register_model(
    annotation_model,
    schema_id,
    validation_model=None,
    dependencies=None,
    options=None,
    overwrite=False,
    *,
    scope="project"
)

Programmatically registers a new annotation model in the dorsal config.
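
A minimal sketch of a registration call. InvoiceAnnotationModel and its module are hypothetical; the model class must be defined in an importable module (not the main script), and an open/ schema_id brings its own validator, so no validation_model is passed:

```python
from dorsal.api import register_model
from my_project.models import InvoiceAnnotationModel  # hypothetical AnnotationModel subclass

register_model(
    annotation_model=InvoiceAnnotationModel,
    schema_id="open/invoice",  # placeholder schema ID; 'open/' schemas supply their own validator
    overwrite=True,            # allow replacing an existing pipeline step
    scope="project",
)
```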

Source code in venv/lib/python3.13/site-packages/dorsal/api/config.py
def register_model(
    annotation_model: Type[AnnotationModel],
    schema_id: str,
    validation_model: dict | Type[Any] | JsonSchemaValidator | None = None,
    dependencies: list[ModelRunnerDependencyConfig] | ModelRunnerDependencyConfig | None = None,
    options: dict | None = None,
    overwrite: bool = False,
    *,
    scope: Literal["project", "global"] = "project",
) -> None:
    """
    Programmatically registers a new annotation model in the dorsal config.
    """
    from dorsal.common.model import is_pydantic_model_class, is_pydantic_model_instance
    from dorsal.common.validators.json_schema import (
        JsonSchemaValidator,
        JSON_SCHEMA_LIVENESS_KEYWORDS,
    )
    from dorsal.common.exceptions import DorsalConfigError, PydanticValidationError

    if scope not in ["project", "global"]:
        raise ValueError("Invalid scope. Must be one of 'project' or 'global'.")

    effective_dependencies_dicts = []
    if dependencies:
        dep_list = dependencies
        if not isinstance(dep_list, list):
            dep_list = [dep_list]
        for i, dep in enumerate(dep_list):
            d_any = cast(Any, dep)
            if is_pydantic_model_instance(d_any):
                effective_dependencies_dicts.append(d_any.model_dump())
            elif isinstance(d_any, dict):
                raise TypeError(
                    f"Item {i} in 'dependencies' is a dict. "
                    "Dependencies must be passed as instances of a "
                    "'ModelRunnerDependencyConfig' subclass (e.g., MediaTypeDependencyConfig)."
                )
            else:
                raise TypeError(
                    f"Item {i} in 'dependencies' is an invalid type ({type(d_any)}). "
                    "Must be an instance of 'ModelRunnerDependencyConfig'."
                )

    model_module, model_name = annotation_model.__module__, annotation_model.__name__
    if model_module == "__main__":
        raise TypeError(f"Model '{model_name}' must be defined in an importable module, not the main script.")
    model_path = (model_module, model_name)

    validation_model_config: tuple[str, str] | dict[str, Any] | None = None
    is_open_schema = schema_id.startswith("open/")

    if is_open_schema:
        schema_name = schema_id.removeprefix("open/")
        validator_path = (
            "dorsal.file.validators.open_schema",
            f"{schema_name}_validator",
        )
        if validation_model is not None:
            raise ValueError(
                f"Ambiguous configuration: You cannot provide a custom 'validation_model' when using an 'open/' schema_id ('{schema_id}')."
            )
        validation_model_config = validator_path

    elif validation_model is not None:
        if isinstance(validation_model, dict):
            if not any(key in validation_model for key in JSON_SCHEMA_LIVENESS_KEYWORDS):
                raise ValueError("The provided 'validation_model' schema dict appears to be inert.")
            validation_model_config = validation_model
        elif is_pydantic_model_class(validation_model):
            validator_module, validator_name = (
                validation_model.__module__,
                validation_model.__name__,
            )
            if validator_module == "__main__":
                raise TypeError(f"Validator class '{validator_name}' must be defined in an importable module.")
            validation_model_config = (validator_module, validator_name)
        elif isinstance(validation_model, JsonSchemaValidator):
            validator_module, validator_name = (
                validation_model.__module__,
                validation_model.__name__,
            )
            if validator_module == "__main__":
                raise TypeError(f"Validator instance '{validator_name}' must be defined in an importable module.")
            validation_model_config = (validator_module, validator_name)
        else:
            raise TypeError(f"Invalid 'validation_model' type ({type(validation_model)}).")

    new_step_data = {
        k: v
        for k, v in {
            "annotation_model": model_path,
            "schema_id": schema_id,
            "dependencies": effective_dependencies_dicts if effective_dependencies_dicts else None,
            "validation_model": validation_model_config,
            "options": options,
        }.items()
        if v is not None
    }

    try:
        validated_step_model = ModelRunnerPipelineStep.model_validate(new_step_data)
        toml_safe_step_data = validated_step_model.model_dump(mode="json", exclude_none=True)
    except PydanticValidationError as e:
        raise DorsalConfigError(f"The provided model configuration is invalid: {e}") from e

    try:
        PipelineConfig.upsert_step(step_data=toml_safe_step_data, overwrite=overwrite, scope=scope)
    except Exception as e:
        raise DorsalConfigError(f"Failed to register model in {scope} config: {e}") from e

remove_files_from_collection

remove_files_from_collection(
    collection_id, hashes, api_key=None
)

Removes a list of files from a remote collection by their hash.

This function automatically handles batching for large lists of hashes to comply with the API limit (10,000 per request).
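
A minimal sketch (placeholder ID and hashes; the counts come from the aggregated RemoveFilesResponse built in the source below):

```python
from dorsal.api import remove_files_from_collection

response = remove_files_from_collection(
    collection_id="col_123",  # placeholder collection ID
    hashes=["<sha256-hash-1>", "<sha256-hash-2>"],
)
print(f"Removed: {response.removed_count}, not found: {response.not_found_count}")
```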

Parameters:

Name Type Description Default
collection_id str

The unique ID of the collection to modify.

required
hashes list[str]

A list of SHA-256 file hashes to remove.

required
api_key str

An API key for this request.

None

Returns:

Name Type Description
RemoveFilesResponse RemoveFilesResponse

A consolidated response summarizing the results.

Raises:

Type Description
DorsalClientError

For API errors.

DorsalError

For other unexpected library errors.

Source code in venv/lib/python3.13/site-packages/dorsal/api/collection.py
def remove_files_from_collection(
    collection_id: str,
    hashes: list[str],
    api_key: str | None = None,
) -> "RemoveFilesResponse":
    """
    Removes a list of files from a remote collection by their hash.

    This function automatically handles batching for large lists of hashes
    to comply with the API limit (10,000 per request).

    Args:
        collection_id (str): The unique ID of the collection to modify.
        hashes (list[str]): A list of SHA-256 file hashes to remove.
        api_key (str, optional): An API key for this request.

    Returns:
        RemoveFilesResponse: A consolidated response summarizing the results.

    Raises:
        DorsalClientError: For API errors.
        DorsalError: For other unexpected library errors.
    """
    from dorsal.session import get_shared_dorsal_client
    from dorsal.client import DorsalClient
    from dorsal.client.validators import RemoveFilesResponse

    if not hashes:
        raise ValueError("The 'hashes' list cannot be empty.")

    effective_client: DorsalClient
    log_message_context = "using shared client"
    if api_key:
        log_message_context = "using temporary client with provided API key"
        effective_client = DorsalClient(api_key=api_key)
    else:
        effective_client = get_shared_dorsal_client()

    logger.debug(
        "Dispatching remove_files_from_collection to client (%s) for ID: '%s' with %d hashes.",
        log_message_context,
        collection_id,
        len(hashes),
    )

    total_removed = 0
    total_not_found = 0
    batches = [hashes[i : i + API_MAX_BATCH_SIZE] for i in range(0, len(hashes), API_MAX_BATCH_SIZE)]

    try:
        for i, batch in enumerate(batches):
            logger.debug(f"Processing batch {i + 1}/{len(batches)}...")
            response = effective_client.remove_files_from_collection(collection_id=collection_id, hashes=batch)
            total_removed += response.removed_count
            total_not_found += response.not_found_count

        aggregate_response = RemoveFilesResponse(removed_count=total_removed, not_found_count=total_not_found)
        logger.info(
            "Successfully finished removing files from collection '%s'. Removed: %d",
            collection_id,
            aggregate_response.removed_count,
        )
        return aggregate_response

    except DorsalError as err:
        logger.warning(
            "A client error occurred during remove_files_from_collection (%s): %s",
            log_message_context,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "An unexpected error occurred during remove_files_from_collection (%s).",
            log_message_context,
        )
        raise DorsalError(
            f"An unexpected error occurred while removing files from collection '{collection_id}': {err}"
        ) from err

remove_model_by_name

remove_model_by_name(name, scope='project')

Removes a model from the pipeline by its name (e.g., "PDFAnnotationModel"). Raises an error if the name is ambiguous (duplicates exist).
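
A minimal sketch, using the model name from the docstring's own example:

```python
from dorsal.api import remove_model_by_name

# Remove the pipeline step registered under this model name from the project config.
remove_model_by_name("PDFAnnotationModel", scope="project")
```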

Source code in venv/lib/python3.13/site-packages/dorsal/api/config.py
def remove_model_by_name(name: str, scope: Literal["project", "global"] = "project") -> None:
    """
    Removes a model from the pipeline by its name (e.g., "PDFAnnotationModel").
    Raises an error if the name is ambiguous (duplicates exist).
    """
    PipelineConfig.remove_step_by_name(name=name, scope=scope)

remove_tag_from_file

remove_tag_from_file(hash_string, tag_id, api_key=None)

Removes a specific tag from a file record.
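
A minimal sketch (hash and tag ID are placeholders):

```python
from dorsal.api import remove_tag_from_file

remove_tag_from_file(
    hash_string="<sha256-hash>",  # placeholder file hash
    tag_id="tag_123",             # placeholder tag ID
)
```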

Parameters:

Name Type Description Default
hash_string str

The hash of the file record.

required
tag_id str

The unique ID of the tag to remove.

required
api_key str

An API key for this request.

None
Source code in venv/lib/python3.13/site-packages/dorsal/api/file.py
def remove_tag_from_file(hash_string: str, tag_id: str, api_key: str | None = None) -> None:
    """
    Removes a specific tag from a file record.

    Args:
        hash_string (str): The hash of the file record.
        tag_id (str): The unique ID of the tag to remove.
        api_key (str, optional): An API key for this request.
    """
    from dorsal.session import get_shared_dorsal_client

    effective_client = get_shared_dorsal_client()
    if api_key:
        from dorsal.client import DorsalClient

        effective_client = DorsalClient(api_key=api_key)

    try:
        effective_client.delete_tag(file_hash=hash_string, tag_id=tag_id)
        return None
    except DorsalClientError:
        raise

scan_directory

scan_directory(
    dir_path,
    recursive=False,
    *,
    api_key=None,
    use_cache=True,
    offline=False
)

Scans a directory and returns a list of LocalFile objects.

This function is a high-level wrapper that processes all files in a given directory and generates their metadata locally using the Annotation Model pipeline.

It returns a list of LocalFile objects.

Example
from dorsal.api import scan_directory

# Scan a directory non-recursively for all files
processed_files = scan_directory("path/to/my_invoices")

print(f"Found {len(processed_files)} files to process.")

# You can now iterate over the list
for f in processed_files:
    if f.size > 500000:
        print(f"{f.name} is a large file.")

Parameters:

Name Type Description Default
dir_path str

The path to the directory you want to scan.

required
recursive bool

If True, scans all subdirectories recursively. Defaults to False.

False
api_key str | None

An API key for this operation, overriding the client's default. Defaults to None.

None
use_cache bool

If True, uses the local hash cache when processing files. Defaults to True.

True
offline bool

If True, the metadata pipeline is blocked from making network calls. Defaults to False.

False

Returns:

Type Description
list[LocalFile]

list[LocalFile]: A list of processed LocalFile objects from the directory.

Source code in venv/lib/python3.13/site-packages/dorsal/api/file.py
def scan_directory(
    dir_path: str, recursive: bool = False, *, api_key: str | None = None, use_cache: bool = True, offline: bool = False
) -> list[LocalFile]:
    """Scans a directory and returns a list of LocalFile objects.

    This function is a high-level wrapper that processes all files in a
    given directory and generates their metadata locally using the Annotation Model pipeline.

    It returns a list of `LocalFile` objects.

    Example:
        ```python
        from dorsal.api import scan_directory

        # Scan a directory non-recursively for all files
        processed_files = scan_directory("path/to/my_invoices")

        print(f"Found {len(processed_files)} files to process.")

        # You can now iterate over the list
        for f in processed_files:
            if f.size > 500000:
                print(f"{f.name} is a large file.")
        ```

    Args:
        dir_path (str): The path to the directory you want to scan.
        recursive (bool, optional): If True, scans all subdirectories
            recursively. Defaults to False.
        api_key (str | None, optional): An API key for this operation,
            overriding the client's default. Defaults to None.
        use_cache (bool, optional): If True, uses the local hash cache when
            processing files. Defaults to True.
        offline (bool, optional): If True, the metadata pipeline is blocked
            from making network calls. Defaults to False.

    Returns:
        list[LocalFile]: A list of processed `LocalFile` objects from the directory.
    """
    from dorsal.file.metadata_reader import MetadataReader

    effective_reader: MetadataReader
    if api_key is not None or offline:
        if api_key:
            logger.debug(
                "API key override for scan_directory (path: '%s'). Attaching MetadataReader configured with this API Key",
                dir_path,
            )
        if offline:
            logger.debug(
                "Offline mode. Attached MetadataReader blocked from making network calls",
            )
        effective_reader = MetadataReader(api_key=api_key)
    else:
        logger.debug(
            "No API key override for scan_directory (file: '%s'). Using shared METADATA_READER.",
            dir_path,
        )
        effective_reader = get_metadata_reader()

    logger.debug(
        "High-level scan_directory calling effective MetadataReader for dir_path='%s', recursive=%s.",
        dir_path,
        recursive,
    )

    try:
        local_files = effective_reader.scan_directory(dir_path=dir_path, recursive=recursive, skip_cache=not use_cache)
        logger.debug(
            "Effective MetadataReader.scan_directory completed for dir_path='%s'. Found %d LocalFile objects.",
            dir_path,
            len(local_files),
        )
        return local_files
    except (FileNotFoundError, DorsalError) as err:
        logger.warning(
            "Call to effective MetadataReader.scan_directory for dir_path='%s' failed: %s - %s",
            dir_path,
            type(err).__name__,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "Unexpected error in high-level scan_directory for dir_path='%s'",
            dir_path,
        )
        if isinstance(err, DorsalError):
            raise
        raise DorsalError(f"An unexpected error occurred while reading directory '{dir_path}'.") from err

scan_file

scan_file(
    file_path,
    *,
    api_key=None,
    use_cache=True,
    offline=False
)

Processes a single file and returns a LocalFile object.

This is a direct wrapper for dorsal.LocalFile. It's a convenient entry point for processing a single file and accessing its metadata without needing to import the LocalFile class directly.

Example
from dorsal.api import scan_file

local_file = scan_file("path/to/my_image.jpg")

print(f"File: {local_file.name}")
print(f"Media Type: {local_file.media_type}")

Parameters:

Name Type Description Default
file_path str

The path to the local file to process.

required
api_key str | None

An API key for this request, overriding the client's default. Defaults to None.

None
use_cache bool

If True, uses the local hash cache when processing the file. Defaults to True.

True
offline bool

If True, the metadata pipeline is blocked from making network calls. Defaults to False.

False

Returns:

Name Type Description
LocalFile LocalFile

An initialized LocalFile instance with extracted metadata.

Source code in venv/lib/python3.13/site-packages/dorsal/api/file.py
def scan_file(
    file_path: str, *, api_key: str | None = None, use_cache: bool = True, offline: bool = False
) -> LocalFile:
    """Processes a single file and returns a LocalFile object.

    This is a direct wrapper for `dorsal.LocalFile`. It's a convenient
    entry point for processing a single file and accessing its metadata
    without needing to import the `LocalFile` class directly.

    Example:
        ```python
        from dorsal.api import scan_file

        local_file = scan_file("path/to/my_image.jpg")

        print(f"File: {local_file.name}")
        print(f"Media Type: {local_file.media_type}")
        ```

    Args:
        file_path (str): The path to the local file to process.
        api_key (str | None, optional): An API key for this request,
            overriding the client's default. Defaults to None.
        use_cache (bool, optional): If True, uses the local hash cache when
            processing the file. Defaults to True.
        offline (bool, optional): If True, the metadata pipeline is blocked
            from making network calls. Defaults to False.

    Returns:
        LocalFile: An initialized `LocalFile` instance with extracted metadata.
    """
    from dorsal.file.metadata_reader import MetadataReader

    effective_reader: MetadataReader

    if api_key is not None or offline:
        if api_key:
            logger.debug(
                "API key override for scan_file (file: '%s'). Attaching MetadataReader configured with this API Key",
                file_path,
            )
        if offline:
            logger.debug(
                "Offline mode. Attached MetadataReader blocked from making network calls",
            )
        effective_reader = MetadataReader(api_key=api_key)
    else:
        logger.debug(
            "No API key override for scan_file (file: '%s'). Using shared METADATA_READER.",
            file_path,
        )
        effective_reader = get_metadata_reader()

    logger.debug("High-level scan_file calling effective MetadataReader for file_path='%s'.", file_path)

    try:
        local_file = effective_reader.scan_file(file_path=file_path, skip_cache=not use_cache)
        logger.debug(
            "Effective MetadataReader.scan_file completed for file_path='%s'. Hash: %s",
            file_path,
            local_file.hash if hasattr(local_file, "hash") else "N/A",
        )
        return local_file
    except (FileNotFoundError, IOError, DorsalError) as err:
        logger.warning(
            "Call to effective MetadataReader.scan_file for file_path='%s' failed: %s - %s",
            file_path,
            type(err).__name__,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "Unexpected error in high-level scan_file for file_path='%s'",
            file_path,
        )
        if isinstance(err, DorsalError):
            raise
        raise DorsalError(f"An unexpected error occurred while reading file '{file_path}'.") from err
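
Example

The keyword-only options can be combined; a short sketch, assuming the LocalFile attributes (`name`, `hash`) referenced in the source's logging:

```python
from dorsal.api import scan_file

# Re-read the file from disk (skip the cache) and block all network calls.
local_file = scan_file("path/to/my_image.jpg", use_cache=False, offline=True)

print(f"File: {local_file.name}")
print(f"Hash: {local_file.hash}")
```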

search_global_files

search_global_files(
    query: str,
    page: int = 1,
    per_page: int = 25,
    sort_by: Literal[
        "date_modified", "date_created", "size", "name"
    ] = "date_modified",
    sort_order: Literal["asc", "desc"] = "desc",
    api_key: str | None = None,
    *,
    match_any: bool = False,
    mode: Literal["pydantic"],
    model_dump_exclude_none: bool = True,
    model_dump_by_alias: bool = True
) -> "FileSearchResponse"
search_global_files(
    query: str,
    page: int = 1,
    per_page: int = 25,
    sort_by: Literal[
        "date_modified", "date_created", "size", "name"
    ] = "date_modified",
    sort_order: Literal["asc", "desc"] = "desc",
    api_key: str | None = None,
    *,
    match_any: bool = False,
    mode: Literal["dict"],
    model_dump_exclude_none: bool = True,
    model_dump_by_alias: bool = True
) -> dict[str, Any]
search_global_files(
    query: str,
    page: int = 1,
    per_page: int = 25,
    sort_by: Literal[
        "date_modified", "date_created", "size", "name"
    ] = "date_modified",
    sort_order: Literal["asc", "desc"] = "desc",
    api_key: str | None = None,
    *,
    match_any: bool = False,
    mode: Literal["json"],
    model_dump_exclude_none: bool = True,
    model_dump_by_alias: bool = True
) -> str
search_global_files(
    query,
    page=1,
    per_page=25,
    sort_by="date_modified",
    sort_order="desc",
    api_key=None,
    *,
    match_any=False,
    mode="pydantic",
    model_dump_exclude_none=True,
    model_dump_by_alias=True
)

Searches for public file records across the entire DorsalHub platform.

This function provides a simple interface to search all public files. Note: This is a premium feature and requires an appropriate account status.

Example
from dorsal.api import search_global_files

# Find all publicly indexed files tagged with 'research'
try:
    response = search_global_files(
        query="tag:research",
        mode="dict"
    )

    print(f"Found {response['pagination']['record_count']} public files tagged 'research'.")
    for record in response['results']:
        print(f"- {record['name']} (hash: {record['hash']})")

except Exception as e:
    print(f"An error occurred during search: {e}")

Parameters:

query (str): The search query string. Supports operators like tag:, name:, extension:, and size:>1MB. Required.
mode (Literal["pydantic", "dict", "json"]): The desired return format. Defaults to 'pydantic'.
page (int): The page number for pagination. Defaults to 1.
per_page (int): The number of results per page. Must be between 1 and 50. Defaults to 25.
sort_by (Literal["date_modified", "date_created", "size", "name"]): The field to sort results by. Defaults to 'date_modified'.
sort_order (Literal["asc", "desc"]): The sort order. Defaults to 'desc'.
match_any (bool): If True, records matching any of the query terms are returned instead of only records matching all terms. Defaults to False.
api_key (str | None): An API key for this request, overriding the client's default. Defaults to None.
model_dump_exclude_none (bool): Whether to omit None-valued fields when mode is "dict" or "json". Defaults to True.
model_dump_by_alias (bool): Whether to serialize fields by their alias when mode is "dict" or "json". Defaults to True.

Returns:

FileSearchResponse | dict | str: The search results, formatted according to the specified mode.

Raises:

DorsalClientError: For client-side validation errors or API errors like authentication, rate limiting, or insufficient permissions (e.g. using this feature on a non-premium account).
DorsalError: For other unexpected library errors.

Source code in venv/lib/python3.13/site-packages/dorsal/api/file.py
def search_global_files(
    query: str,
    page: int = 1,
    per_page: int = 25,
    sort_by: Literal["date_modified", "date_created", "size", "name"] = "date_modified",
    sort_order: Literal["asc", "desc"] = "desc",
    api_key: str | None = None,
    *,
    match_any: bool = False,
    mode: Literal["pydantic", "dict", "json"] = "pydantic",
    model_dump_exclude_none: bool = True,
    model_dump_by_alias: bool = True,
) -> "FileSearchResponse | dict | str":
    """Searches for public file records across the entire DorsalHub platform.

    This function provides a simple interface to search all public files.
    Note: This is a premium feature and requires an appropriate account status.

    Example:
        ```python
        from dorsal.api import search_global_files

        # Find all publicly indexed files tagged with 'research'
        try:
            response = search_global_files(
                query="tag:research",
                mode="dict"
            )

            print(f"Found {response['pagination']['record_count']} public files tagged 'research'.")
            for record in response['results']:
                print(f"- {record['name']} (hash: {record['hash']})")

        except Exception as e:
            print(f"An error occurred during search: {e}")
        ```

    Args:
        query (str): The search query string. Supports operators like
            `tag:`, `name:`, `extension:`, and `size:>1MB`.
        mode (Literal["pydantic", "dict", "json"]): The desired return format.
            Defaults to "pydantic".
        page (int): The page number for pagination. Defaults to 1.
        per_page (int): The number of results per page. Must be between 1 and 50.
            Defaults to 25.
        sort_by (Literal): The field to sort results by. Defaults to 'date_modified'.
        sort_order (Literal): The sort order ('asc' or 'desc'). Defaults to 'desc'.
        match_any (bool): If True, records matching any of the query terms are
            returned instead of only records matching all terms. Defaults to False.
        api_key (str | None): An API key for this request, overriding the
            client's default. Defaults to None.
        model_dump_exclude_none (bool): Whether to omit None-valued fields when
            mode is "dict" or "json". Defaults to True.
        model_dump_by_alias (bool): Whether to serialize fields by their alias
            when mode is "dict" or "json". Defaults to True.

    Returns:
        Union[FileSearchResponse, dict, str]: The search results, formatted
            according to the specified `mode`.

    Raises:
        DorsalClientError: For client-side validation errors or API errors like
            authentication, rate limiting, or insufficient permissions (e.g.
            using this feature on a non-premium account).
        DorsalError: For other unexpected library errors.
    """
    from dorsal.client import DorsalClient

    effective_client: DorsalClient
    log_message_context = ""
    if api_key:
        log_message_context = "using temporary client with provided API key"
        logger.debug("API key override provided for global search. Creating temporary DorsalClient.")
        effective_client = DorsalClient(api_key=api_key)
    else:
        log_message_context = "using shared client via MetadataReader"
        logger.debug("No API key override for global search. Using shared client instance.")
        effective_client = get_metadata_reader()._client

    logger.debug(
        "Dispatching global file search to client (%s) with query: '%s'",
        log_message_context,
        query,
    )

    try:
        response = effective_client.search_files(
            q=query,
            scope="global",
            page=page,
            per_page=per_page,
            sort_by=sort_by,
            sort_order=sort_order,
            match_any=match_any,
        )

        logger.debug(
            "Global file search successful. Returned page %d of %d, with %d records.",
            response.pagination.current_page,
            response.pagination.page_count,
            len(response.results),
        )

        if mode == "pydantic":
            return response
        if mode == "dict":
            return response.model_dump(
                by_alias=model_dump_by_alias,
                exclude_none=model_dump_exclude_none,
                mode="json",
            )
        if mode == "json":
            return response.model_dump_json(
                by_alias=model_dump_by_alias,
                exclude_none=model_dump_exclude_none,
                indent=2,
            )
        raise ValueError(f"Invalid mode: '{mode}'. Must be one of 'pydantic', 'dict', or 'json'.")  # pragma: no cover

    except DorsalError as err:
        logger.warning(
            "A client error occurred during search_global_files (%s): %s - %s",
            log_message_context,
            type(err).__name__,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "An unexpected error occurred during search_global_files (%s).",
            log_message_context,
        )
        raise DorsalError(f"An unexpected error occurred during global file search: {err}") from err
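
Example

A pagination sketch in the default "pydantic" mode; it relies only on the response attributes the source's logging already uses (results, pagination.current_page, pagination.page_count):

```python
from dorsal.api import search_global_files

# Walk every page of public PDF records, 50 at a time (the per-page maximum).
page = 1
while True:
    response = search_global_files(query="extension:pdf", page=page, per_page=50)
    for record in response.results:
        print(record)
    if response.pagination.current_page >= response.pagination.page_count:
        break
    page += 1
```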

search_user_files

search_user_files(
    query: str,
    deduplicate: bool = True,
    page: int = 1,
    per_page: int = 25,
    sort_by: Literal[
        "date_modified", "date_created", "size", "name"
    ] = "date_modified",
    sort_order: Literal["asc", "desc"] = "desc",
    *,
    match_any: bool = False,
    api_key: str | None = None,
    mode: Literal["pydantic"],
    model_dump_exclude_none: bool = True,
    model_dump_by_alias: bool = True
) -> "FileSearchResponse"
search_user_files(
    query: str,
    deduplicate: bool = True,
    page: int = 1,
    per_page: int = 25,
    sort_by: Literal[
        "date_modified", "date_created", "size", "name"
    ] = "date_modified",
    sort_order: Literal["asc", "desc"] = "desc",
    *,
    match_any: bool = False,
    api_key: str | None = None,
    mode: Literal["dict"],
    model_dump_exclude_none: bool = True,
    model_dump_by_alias: bool = True
) -> dict[str, Any]
search_user_files(
    query: str,
    deduplicate: bool = True,
    page: int = 1,
    per_page: int = 25,
    sort_by: Literal[
        "date_modified", "date_created", "size", "name"
    ] = "date_modified",
    sort_order: Literal["asc", "desc"] = "desc",
    *,
    match_any: bool = False,
    api_key: str | None = None,
    mode: Literal["json"],
    model_dump_exclude_none: bool = True,
    model_dump_by_alias: bool = True
) -> str
search_user_files(
    query,
    deduplicate=True,
    page=1,
    per_page=25,
    sort_by="date_modified",
    sort_order="desc",
    *,
    match_any=False,
    api_key=None,
    mode="pydantic",
    model_dump_exclude_none=True,
    model_dump_by_alias=True
)

Searches for file records indexed by the authenticated user.

This function provides a simple and powerful interface to search for files you have indexed on DorsalHub. The query supports simple text matching as well as advanced operators.

Example
from dorsal.api import search_user_files

# Find all PDF files you have indexed, sorted by name
try:
    response = search_user_files(
        query="extension:pdf",
        sort_by="name",
        sort_order="asc",
        mode="dict"
    )

    print(f"Found {response['pagination']['record_count']} matching PDF files.")
    for record in response['results']:
        print(f"- {record['name']}")

except Exception as e:
    print(f"An error occurred during search: {e}")

Parameters:

query (str): The search query string. Supports operators like tag:, name:, extension:, and size:>1MB. Required.
mode (Literal["pydantic", "dict", "json"]): The desired return format. Defaults to 'pydantic'.
deduplicate (bool): If True, returns only unique file records based on their content hash. Defaults to True.
page (int): The page number for pagination. Defaults to 1.
per_page (int): The number of results per page. Must be between 1 and 50. Defaults to 25.
sort_by (Literal["date_modified", "date_created", "size", "name"]): The field to sort results by. Defaults to 'date_modified'.
sort_order (Literal["asc", "desc"]): The sort order. Defaults to 'desc'.
match_any (bool): If True, records matching any of the query terms are returned instead of only records matching all terms. Defaults to False.
api_key (str | None): An API key for this request, overriding the client's default. Defaults to None.
model_dump_exclude_none (bool): Whether to omit None-valued fields when mode is "dict" or "json". Defaults to True.
model_dump_by_alias (bool): Whether to serialize fields by their alias when mode is "dict" or "json". Defaults to True.

Returns:

FileSearchResponse | dict | str: The search results, formatted according to the specified mode.

Raises:

DorsalClientError: For client-side validation errors or API errors like authentication or rate limiting.
DorsalError: For other unexpected library errors.

Source code in venv/lib/python3.13/site-packages/dorsal/api/file.py
def search_user_files(
    query: str,
    deduplicate: bool = True,
    page: int = 1,
    per_page: int = 25,
    sort_by: Literal["date_modified", "date_created", "size", "name"] = "date_modified",
    sort_order: Literal["asc", "desc"] = "desc",
    *,
    match_any: bool = False,
    api_key: str | None = None,
    mode: Literal["pydantic", "dict", "json"] = "pydantic",
    model_dump_exclude_none: bool = True,
    model_dump_by_alias: bool = True,
) -> "FileSearchResponse | dict | str":
    """Searches for file records indexed by the authenticated user.

    This function provides a simple and powerful interface to search for files
    you have indexed on DorsalHub. The query supports simple text matching as
    well as advanced operators.

    Example:
        ```python
        from dorsal.api import search_user_files

        # Find all PDF files you have indexed, sorted by name
        try:
            response = search_user_files(
                query="extension:pdf",
                sort_by="name",
                sort_order="asc",
                mode="dict"
            )

            print(f"Found {response['pagination']['record_count']} matching PDF files.")
            for record in response['results']:
                print(f"- {record['name']}")

        except Exception as e:
            print(f"An error occurred during search: {e}")
        ```

    Args:
        query (str): The search query string. Supports operators like
            `tag:`, `name:`, `extension:`, and `size:>1MB`.
        mode (Literal["pydantic", "dict", "json"]): The desired return format.
            Defaults to "pydantic".
        deduplicate (bool): If True, returns only unique file records based on
            their content hash. Defaults to True.
        page (int): The page number for pagination. Defaults to 1.
        per_page (int): The number of results per page. Must be between 1 and 50.
            Defaults to 25.
        sort_by (Literal): The field to sort results by. Defaults to 'date_modified'.
        sort_order (Literal): The sort order ('asc' or 'desc'). Defaults to 'desc'.
        match_any (bool): If True, records matching any of the query terms are
            returned instead of only records matching all terms. Defaults to False.
        api_key (str | None): An API key for this request, overriding the
            client's default. Defaults to None.
        model_dump_exclude_none (bool): Whether to omit None-valued fields when
            mode is "dict" or "json". Defaults to True.
        model_dump_by_alias (bool): Whether to serialize fields by their alias
            when mode is "dict" or "json". Defaults to True.

    Returns:
        Union[FileSearchResponse, dict, str]: The search results, formatted
            according to the specified `mode`.

    Raises:
        DorsalClientError: For client-side validation errors or API errors
            like authentication or rate limiting.
        DorsalError: For other unexpected library errors.
    """
    from dorsal.client import DorsalClient

    effective_client: DorsalClient
    log_message_context = ""
    if api_key:
        log_message_context = "using temporary client with provided API key"
        logger.debug("API key override provided for search. Creating temporary DorsalClient.")
        effective_client = DorsalClient(api_key=api_key)
    else:
        log_message_context = "using shared client via MetadataReader"
        logger.debug("No API key override for search. Using shared client instance.")
        effective_client = get_metadata_reader()._client

    logger.debug(
        "Dispatching user file search to client (%s) with query: '%s'",
        log_message_context,
        query,
    )

    try:
        response = effective_client.search_files(
            q=query,
            scope="user",
            deduplicate=deduplicate,
            page=page,
            per_page=per_page,
            sort_by=sort_by,
            sort_order=sort_order,
            match_any=match_any,
        )

        logger.debug(
            "User file search successful. Returned page %d of %d, with %d records.",
            response.pagination.current_page,
            response.pagination.page_count,
            len(response.results),
        )

        if mode == "pydantic":
            return response
        if mode == "dict":
            return response.model_dump(
                by_alias=model_dump_by_alias,
                exclude_none=model_dump_exclude_none,
                mode="json",
            )
        if mode == "json":
            return response.model_dump_json(
                by_alias=model_dump_by_alias,
                exclude_none=model_dump_exclude_none,
                indent=2,
            )

        raise ValueError(f"Invalid mode: '{mode}'. Must be one of 'pydantic', 'dict', or 'json'.")  # pragma: no cover

    except DorsalError as err:
        logger.warning(
            "A client error occurred during search_user_files (%s): %s - %s",
            log_message_context,
            type(err).__name__,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "An unexpected error occurred during search_user_files (%s).",
            log_message_context,
        )
        raise DorsalError(f"An unexpected error occurred during file search: {err}") from err
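
Example

A sketch of the "json" mode, using the documented size: operator; deduplicate=False keeps multiple records that share a content hash:

```python
from dorsal.api import search_user_files

# Pretty-printed JSON string (model_dump_json with indent=2 in the source above).
json_payload = search_user_files(
    query="size:>1MB",
    deduplicate=False,
    mode="json",
)
print(json_payload)
```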

show_model_pipeline

show_model_pipeline(scope='effective')

Returns a simplified, human-readable summary of the pipeline.

Source code in venv/lib/python3.13/site-packages/dorsal/api/config.py
def show_model_pipeline(scope: Literal["effective", "project", "global"] = "effective") -> list[dict[str, Any]]:
    """
    Returns a simplified, human-readable summary of the pipeline.
    """
    steps = PipelineConfig.get_steps(scope=scope)
    summary = []
    for i, step in enumerate(steps):
        deps_str = "None"
        if step.dependencies:
            deps = [d.type for d in step.dependencies]
            deps_str = ", ".join(deps)

        status = "Active"
        if step.deactivated:
            status = "Deactivated"
        if i == 0:
            status = "Base (Locked)"

        summary.append(
            {
                "index": i,
                "status": status,
                "name": step.annotation_model.name,
                "module": step.annotation_model.module,
                "schema_id": step.schema_id,
                "dependencies": deps_str,
            }
        )
    return summary
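
Example

A minimal sketch that prints the summary, assuming show_model_pipeline is exported from dorsal.api like the other helpers on this page; the dictionary keys match those built in the source above:

```python
from dorsal.api import show_model_pipeline

# One line per pipeline step: index, status, model name, and module.
for step in show_model_pipeline(scope="project"):
    print(f"[{step['index']}] {step['status']:<14} {step['name']} ({step['module']})")
```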

update_collection

update_collection(
    collection_id: str,
    name: str | None = None,
    description: str | None = None,
    api_key: str | None = None,
    *,
    mode: Literal["pydantic"]
) -> FileCollection
update_collection(
    collection_id: str,
    name: str | None = None,
    description: str | None = None,
    api_key: str | None = None,
    *,
    mode: Literal["dict"]
) -> dict[str, Any]
update_collection(
    collection_id: str,
    name: str | None = None,
    description: str | None = None,
    api_key: str | None = None,
    *,
    mode: Literal["json"]
) -> str
update_collection(
    collection_id,
    name=None,
    description=None,
    api_key=None,
    *,
    mode="pydantic"
)

Updates the metadata of a remote collection on DorsalHub.

Parameters:

collection_id (str): The unique ID of the collection to update. Required.
name (str, optional): The new name for the collection. Defaults to None.
description (str, optional): The new description for the collection. Defaults to None.
api_key (str, optional): An API key for this request. Defaults to None.
mode (Literal["pydantic", "dict", "json"]): The desired return format. Defaults to 'pydantic'.

Returns:

FileCollection | dict[str, Any] | str: The updated collection data, formatted according to the specified mode.

Raises:

DorsalClientError: For API errors.
DorsalError: For other unexpected library errors.

Source code in venv/lib/python3.13/site-packages/dorsal/api/collection.py
def update_collection(
    collection_id: str,
    name: str | None = None,
    description: str | None = None,
    api_key: str | None = None,
    *,
    mode: Literal["pydantic", "dict", "json"] = "pydantic",
) -> "FileCollection | dict[str, Any] | str":
    """
    Updates the metadata of a remote collection on DorsalHub.

    Args:
        collection_id (str): The unique ID of the collection to update.
        name (str, optional): The new name for the collection.
        description (str, optional): The new description for the collection.
        api_key (str, optional): An API key for this request.
        mode (Literal["pydantic", "dict", "json"]): The desired return format.

    Returns:
        The updated collection data, formatted according to the specified `mode`.

    Raises:
        DorsalClientError: For API errors.
        DorsalError: For other unexpected library errors.
    """
    from dorsal.session import get_shared_dorsal_client
    from dorsal.client import DorsalClient

    if not any([name, description]):
        raise ValueError("At least one field (name or description) must be provided to update.")

    effective_client: DorsalClient

    if api_key:
        log_message_context = "using temporary client with provided API key"
        effective_client = DorsalClient(api_key=api_key)
    else:
        log_message_context = "using shared client"
        effective_client = get_shared_dorsal_client()

    logger.debug(
        "Dispatching update_collection to client (%s) for ID: '%s'",
        log_message_context,
        collection_id,
    )

    try:
        response = effective_client.update_collection(collection_id=collection_id, name=name, description=description)

        if mode == "pydantic":
            return response
        if mode == "dict":
            return response.model_dump(mode="json", by_alias=True, exclude_none=True)
        if mode == "json":
            return response.model_dump_json(indent=2, by_alias=True, exclude_none=True)

        raise ValueError(f"Invalid mode: '{mode}'.")

    except DorsalError as err:
        logger.warning(
            "A client error occurred during update_collection (%s): %s",
            log_message_context,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "An unexpected error occurred during update_collection (%s).",
            log_message_context,
        )
        raise DorsalError(f"An unexpected error occurred while updating collection '{collection_id}': {err}") from err
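
Example

A short sketch that renames a collection and returns the updated record as a dict; the collection ID is a hypothetical placeholder:

```python
from dorsal.api import update_collection

updated = update_collection(
    collection_id="col_12345",  # hypothetical ID for illustration
    name="Quarterly Reports",
    mode="dict",
)
print(updated)
```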

validate_dataset_records

validate_dataset_records(
    dataset_id,
    records,
    schema_dict=None,
    api_key=None,
    client=None,
)

Validates records against a dataset's JSON schema.

Orchestrates schema retrieval (if a schema is not provided directly), custom validator preparation, and record-by-record validation. Returns a summary of the validation results.

Parameters:

dataset_id (str): Identifier of the dataset. Used to fetch the schema if schema_dict is None, and for logging/error context. Required.
records (list[dict]): A list of dictionaries, where each dictionary is a record to be validated. Required.
schema_dict (dict | None): Optional. A pre-fetched JSON schema dictionary. If provided, get_dataset_schema will not be called. Defaults to None.
api_key (str | None): Optional API key, used by get_dataset_schema if schema_dict is not provided. Uses the client's default if None. Defaults to None.
client (DorsalClient | None): Optional. An existing DorsalClient instance passed to get_dataset_schema when fetching the schema. Defaults to None.

Returns:

dict: A summary of validation results, including counts for total, valid, and invalid records, and detailed error information for each invalid record.

Raises:

ValueError: If dataset_id is invalid, records is not a list, or if schema_dict is provided but is not a valid, non-empty dictionary.
ApiDataValidationError: If a schema (fetched or provided) is invalid or cannot be used to prepare a validator (e.g., due to JSONSchemaSchemaError during preparation).
DorsalClientError: (And its subclasses like AuthError, NotFoundError, NetworkError, APIError, etc.) Propagated if get_dataset_schema is called and encounters an issue.
JSONSchemaSchemaError: Propagated from validate_records_with_validator if the validator's schema has issues found during the record validation loop (should be rare if prepare_custom_validator succeeds).

Source code in venv/lib/python3.13/site-packages/dorsal/api/dataset.py
def validate_dataset_records(
    dataset_id: str,
    records: list[dict],
    schema_dict: dict | None = None,
    api_key: str | None = None,
    client: DorsalClient | None = None,
) -> dict:
    """Validates records against a dataset's JSON schema.

    Orchestrates schema retrieval (if a schema is not provided directly),
    custom validator preparation, and record-by-record validation.
    Returns a summary of the validation results.

    Args:
        dataset_id: Identifier of the dataset. Used to fetch the schema if
                    `schema_dict` is None, and for logging/error context.
        records: A list of dictionaries, where each dictionary is a record
                 to be validated.
        schema_dict: Optional. A pre-fetched JSON schema dictionary. If provided,
                     `get_dataset_schema` will not be called.
        api_key: Optional API key, used by `get_dataset_schema` if `schema_dict` is not
                 provided. Uses the client's default if None.
        client: Optional. An existing `DorsalClient` instance passed to
                `get_dataset_schema` when fetching the schema.

    Returns:
        dict: A summary of validation results, including counts for total,
              valid, and invalid records, and detailed error information for
              each invalid record.

    Raises:
        ValueError: If `dataset_id` is invalid, `records` is not a list,
                    or if `schema_dict` is provided but is not a valid,
                    non-empty dictionary.
        ApiDataValidationError: If a schema (fetched or provided) is invalid or
                                cannot be used to prepare a validator (e.g., due
                                to `JSONSchemaSchemaError` during preparation).
        DorsalClientError: (And its subclasses like AuthError, NotFoundError,
                           NetworkError, APIError, etc.) Propagated if `get_dataset_schema`
                           is called and encounters an issue.
        JSONSchemaSchemaError: Propagated from `validate_records_with_validator`
                               if the validator's schema has issues found during
                               the record validation loop (should be rare if
                               `prepare_custom_validator` succeeds).
    """
    logger.debug(
        "Initiating record validation for dataset_id: '%s'. %s records. Schema provided: %s. API key: %s. Custom DorsalClient: %s",
        dataset_id,
        (len(records) if isinstance(records, list) else "Invalid 'records' input (not a list)"),
        "Yes" if schema_dict is not None else "No",
        "Yes (user-provided)" if api_key else "No (client default)",
        "Yes" if client else "No",
    )

    if not (isinstance(dataset_id, str) and dataset_id.strip()):
        logger.warning(
            "Dataset ID must be a non-empty string. Got: '%s' (type: %s)",
            dataset_id,
            type(dataset_id).__name__,
        )
        raise ValueError("Dataset ID must be a non-empty string.")

    if not isinstance(records, list):
        logger.warning("Input 'records' must be a list. Got: %s", type(records).__name__)  # type: ignore[unreachable]
        raise ValueError(f"Input 'records' must be a list, got {type(records).__name__}.")  # type: ignore[unreachable]

    actual_schema_to_use: dict
    if schema_dict is not None:
        logger.debug("Using user-provided schema for dataset_id: '%s'.", dataset_id)
        if not isinstance(schema_dict, dict) or not schema_dict:
            logger.warning(
                "Provided schema_dict for dataset_id '%s' must be a non-empty dictionary. Got type: %s",
                dataset_id,
                type(schema_dict).__name__,
            )
            raise ValueError("Provided schema_dict must be a non-empty dictionary.")
        actual_schema_to_use = schema_dict
    else:
        logger.debug("Schema not provided for dataset_id: '%s'; attempting to fetch.", dataset_id)
        try:
            actual_schema_to_use = get_dataset_schema(dataset_id=dataset_id.strip(), api_key=api_key, client=client)
            logger.debug("Successfully fetched schema for dataset_id: '%s'.", dataset_id)
        except Exception as err:
            logger.warning(
                "Failed to fetch schema for dataset_id '%s' (needed for validation): %s - %s",
                dataset_id,
                type(err).__name__,
                err,
            )
            raise

    try:
        validator = get_json_schema_validator(schema=actual_schema_to_use)
        logger.debug("Schema validator prepared successfully for dataset_id: '%s'.", dataset_id)
    except (ValueError, ApiDataValidationError) as err:
        logger.warning(
            "Failed to prepare schema validator for dataset_id '%s' using the schema. Error: %s - %s",
            dataset_id,
            type(err).__name__,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "Unexpected error preparing schema validator for dataset_id '%s'.",
            dataset_id,
        )
        raise ApiDataValidationError(
            f"Could not prepare validator for dataset_id '{dataset_id}' due to an unexpected error with the schema or validator setup."
        ) from err

    try:
        validation_summary = json_schema_validate_records(records=records, validator=validator)
        logger.debug(
            "Record validation process completed via helper for dataset_id: '%s'.",
            dataset_id,
        )
        return validation_summary
    except (ValueError, JSONSchemaSchemaError) as err:
        logger.warning(
            "Record validation failed for dataset_id '%s' due to issues within the validator or record structure: %s - %s",
            dataset_id,
            type(err).__name__,
            err,
        )
        raise
    except Exception as err:
        logger.exception(
            "Unexpected error during the record validation stage for dataset_id '%s'.",
            dataset_id,
        )
        raise DorsalError(
            f"An unexpected error occurred while validating records for dataset_id '{dataset_id}'."
        ) from err
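
Example

A self-contained sketch that supplies schema_dict directly, so no schema fetch (and no network call) is needed; the dataset ID and schema are hypothetical placeholders:

```python
from dorsal.api import validate_dataset_records

schema = {
    "type": "object",
    "properties": {"name": {"type": "string"}},
    "required": ["name"],
}
records = [{"name": "ok"}, {"name": 123}]  # second record violates the schema

summary = validate_dataset_records(
    dataset_id="my-dataset",  # hypothetical ID for illustration
    records=records,
    schema_dict=schema,
)
print(summary)  # counts of total/valid/invalid records plus per-record error details
```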