Skip to content

Helpers

dorsal.file.helpers

ClassificationLabel

Bases: TypedDict

A dictionary for a single classification label.

build_classification_record

build_classification_record(
    labels,
    vocabulary=None,
    score_explanation=None,
    vocabulary_url=None,
)

Builds a valid 'open/classification' annotation record.

Parameters:

Name Type Description Default
labels list[str | ClassificationLabel]

A list of simple strings (e.g., ["cat", "dog"]) or dictionaries (e.g., [{"label": "cat", "score": 0.95}]).

required
vocabulary list[str] | None

Optional list of all possible labels.

None
score_explanation str | None

Optional string explaining the 'score' field.

None
vocabulary_url str | None

Optional URL pointing to an external vocabulary.

None

Returns:

Type Description
dict[str, Any]

A dictionary structured to match the 'open/classification' schema.

Source code in venv/lib/python3.13/site-packages/dorsal/file/helpers.py
def build_classification_record(
    labels: list[str | ClassificationLabel],
    vocabulary: list[str] | None = None,
    score_explanation: str | None = None,
    vocabulary_url: str | None = None,
) -> dict[str, Any]:
    """
    Builds a valid 'open/classification' annotation record.

    Args:
        labels: A list of simple strings (e.g., ["cat", "dog"]) or
            dictionaries (e.g., [{"label": "cat", "score": 0.95}]).
        vocabulary: Optional list of all possible labels.
        score_explanation: Optional string explaining the 'score' field.
        vocabulary_url: Optional URL pointing to an external vocabulary.

    Returns:
        A dictionary structured to match the 'open/classification' schema.
    """
    processed_labels: list[ClassificationLabel] = []
    if not isinstance(labels, list):
        raise TypeError(f"'labels' must be a list of strings or dictionaries, got {type(labels).__name__}.")

    for item in labels:
        if isinstance(item, str):
            processed_labels.append({"label": item})
        elif isinstance(item, dict):
            if "attributes" in item:
                _validate_attributes(item["attributes"])
            processed_labels.append(item)
        else:
            raise TypeError(
                f"Items in 'labels' list must be a string (str) or a "
                f"dictionary (ClassificationLabel), got {type(item).__name__}."
            )

    if not processed_labels and vocabulary is None and vocabulary_url is None:
        raise ValueError(
            "The 'classification' schema requires 'vocabulary' or 'vocabulary_url' "
            "to be provided when the 'labels' list is empty."
        )

    record_data: dict[str, Any] = {
        "labels": processed_labels,
    }

    if score_explanation is not None:
        record_data["score_explanation"] = score_explanation
    if vocabulary is not None:
        record_data["vocabulary"] = vocabulary
    if vocabulary_url is not None:
        record_data["vocabulary_url"] = vocabulary_url

    return record_data

build_embedding_record

build_embedding_record(
    vector, model=None, target=None, attributes=None
)

Builds a valid 'open/embedding' annotation record.

Parameters:

Name Type Description Default
vector list[float]

The embedding vector.

required
model str | None

Optional name of the model or model used.

None
attributes dict[str, Any] | None

Optional arbitrary metadata (max 16 items, flat).

None

Returns:

Type Description
dict[str, Any]

A dictionary structured to match the 'open/embedding' schema.

Source code in venv/lib/python3.13/site-packages/dorsal/file/helpers.py
def build_embedding_record(
    vector: list[float],
    model: str | None = None,
    target: str | None = None,
    attributes: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """
    Builds a valid 'open/embedding' annotation record.

    Args:
        vector: The embedding vector.
        model: Optional name of the model or model used.
        attributes: Optional arbitrary metadata (max 16 items, flat).

    Returns:
        A dictionary structured to match the 'open/embedding' schema.
    """
    record_data: dict[str, Any] = {
        "vector": vector,
    }
    if model is not None:
        record_data["model"] = model
    if target is not None:
        record_data["target"] = target

    if attributes is not None:
        _validate_attributes(attributes)
        record_data["attributes"] = attributes

    return record_data

build_generic_record

build_generic_record(data, description=None)

Builds a valid 'open/generic' annotation record.

Parameters:

Name Type Description Default
data dict[str, Union[str, int, float, bool, None]]

A flat dictionary of key-value pairs.

required
description str | None

A description of the data (max 256 chars). Can be None.

None

Returns:

Type Description
dict[str, Any]

A dictionary structured to match the 'open/generic' schema.

Source code in venv/lib/python3.13/site-packages/dorsal/file/helpers.py
def build_generic_record(
    data: dict[str, Union[str, int, float, bool, None]],
    description: str | None = None,
) -> dict[str, Any]:
    """
    Builds a valid 'open/generic' annotation record.

    Args:
        data: A flat dictionary of key-value pairs.
        description: A description of the data (max 256 chars). Can be None.

    Returns:
        A dictionary structured to match the 'open/generic' schema.
    """
    if len(data) > 128:
        raise ValueError(f"The 'data' object cannot have more than 128 items. Got {len(data)}.")

    for key, value in data.items():
        if isinstance(value, (dict, list, tuple, set)):
            raise TypeError(
                f"The 'generic' schema disallows nesting. "
                f"Key '{key}' has value of type '{type(value).__name__}', "
                f"but only str, int, float, bool, or None are allowed."
            )

    record_data: dict[str, Any] = {
        "data": data,
    }

    if description is not None:
        record_data["description"] = description

    return record_data

build_llm_output_record

build_llm_output_record(
    model,
    response_data,
    prompt=None,
    language=None,
    score=None,
    score_explanation=None,
    generation_params=None,
    generation_metadata=None,
)

Builds a valid 'open/llm-output' annotation record.

Parameters:

Name Type Description Default
model str

The ID or name of the generative model used.

required
response_data str | dict[str, Any]

The generative output (string or simple dict).

required
prompt str | None

Optional prompt provided to the model.

None
language str | None

Optional 3-letter ISO-639-3 language code.

None
score float | None

Optional confidence or evaluation score [-1, 1].

None
score_explanation str | None

Optional explanation of what the score represents.

None
generation_params dict[str, Any] | None

Optional dict of parameters sent to the API.

None
generation_metadata dict[str, Any] | None

Optional dict of metadata returned by the API.

None

Returns:

Type Description
dict[str, Any]

A dictionary structured to match the 'open/llm-output' schema.

Source code in venv/lib/python3.13/site-packages/dorsal/file/helpers.py
def build_llm_output_record(
    model: str,
    response_data: str | dict[str, Any],
    prompt: str | None = None,
    language: str | None = None,
    score: float | None = None,
    score_explanation: str | None = None,
    generation_params: dict[str, Any] | None = None,
    generation_metadata: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """
    Builds a valid 'open/llm-output' annotation record.

    Args:
        model: The ID or name of the generative model used.
        response_data: The generative output (string or simple dict).
        prompt: Optional prompt provided to the model.
        language: Optional 3-letter ISO-639-3 language code.
        score: Optional confidence or evaluation score [-1, 1].
        score_explanation: Optional explanation of what the score represents.
        generation_params: Optional dict of parameters sent to the API.
        generation_metadata: Optional dict of metadata returned by the API.

    Returns:
        A dictionary structured to match the 'open/llm-output' schema.
    """
    final_response_data: str
    if isinstance(response_data, dict):
        try:
            final_response_data = json.dumps(response_data)
        except TypeError as e:
            raise TypeError(
                f"The 'response_data' dictionary could not be serialized to JSON. "
                f"It may contain non-serializable types. Original error: {e}"
            ) from e
    else:
        final_response_data = response_data

    record_data: dict[str, Any] = {
        "model": model,
        "response_data": final_response_data,
    }

    if prompt is not None:
        record_data["prompt"] = prompt
    if language is not None:
        record_data["language"] = language
    if score is not None:
        record_data["score"] = score
    if score_explanation is not None:
        record_data["score_explanation"] = score_explanation
    if generation_params is not None:
        record_data["generation_params"] = generation_params
    if generation_metadata is not None:
        record_data["generation_metadata"] = generation_metadata

    return record_data

build_location_record

build_location_record(
    longitude,
    latitude,
    id=None,
    timestamp=None,
    camera_make=None,
    camera_model=None,
    bbox=None,
    properties=None,
)

Builds a valid 'open/geolocation' record for a simple Point.

Parameters:

Name Type Description Default
longitude float

The longitude coordinate.

required
latitude float

The latitude coordinate.

required
id str | int | float | None

Optional unique identifier for the feature.

None
timestamp str | None

Optional ISO 8601 timestamp.

None
camera_make str | None

Optional make of the camera/sensor.

None
camera_model str | None

Optional model of the camera/sensor.

None
bbox list[float] | None

Optional Bounding Box array (RFC 7946).

None
properties dict[str, Any] | None

Optional dictionary of additional properties (GeoJSON 'properties'). Must not exceed 100 items.

None

Returns:

Type Description
dict[str, Any]

A dictionary structured to match the 'open/geolocation' schema (GeoJSON Feature).

Source code in venv/lib/python3.13/site-packages/dorsal/file/helpers.py
def build_location_record(
    longitude: float,
    latitude: float,
    id: str | int | float | None = None,
    timestamp: str | None = None,
    camera_make: str | None = None,
    camera_model: str | None = None,
    bbox: list[float] | None = None,
    properties: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """
    Builds a valid 'open/geolocation' record for a simple Point.

    Args:
        longitude: The longitude coordinate.
        latitude: The latitude coordinate.
        id: Optional unique identifier for the feature.
        timestamp: Optional ISO 8601 timestamp.
        camera_make: Optional make of the camera/sensor.
        camera_model: Optional model of the camera/sensor.
        bbox: Optional Bounding Box array (RFC 7946).
        properties: Optional dictionary of additional properties (GeoJSON 'properties').
                   Must not exceed 100 items.

    Returns:
        A dictionary structured to match the 'open/geolocation' schema (GeoJSON Feature).
    """
    feature_properties: dict[str, Any] = {}
    if timestamp is not None:
        feature_properties["timestamp"] = timestamp
    if camera_make is not None:
        feature_properties["camera_make"] = camera_make
    if camera_model is not None:
        feature_properties["camera_model"] = camera_model

    if properties is not None:
        feature_properties.update(properties)

    if len(feature_properties) > 100:
        raise ValueError(f"The 'properties' object cannot have more than 100 items. Got {len(feature_properties)}.")

    record_data: dict[str, Any] = {
        "type": "Feature",
        "geometry": {"type": "Point", "coordinates": [longitude, latitude]},
        "properties": feature_properties if feature_properties else None,
    }

    if id is not None:
        record_data["id"] = id
    if bbox is not None:
        record_data["bbox"] = bbox

    return record_data

build_regression_point

build_regression_point(
    value,
    *,
    statistic=None,
    quantile_level=None,
    interval_lower=None,
    interval_upper=None,
    score=None,
    timestamp=None,
    attributes=None
)

Constructs a validated dictionary for a single regression data point.

This helper is designed to be used when building complex datasets (like time-series or multi-point forecasts) where you need to generate a list of points before wrapping them in a full record.

Parameters:

Name Type Description Default
value float | None

The predicted or sampled value. Can be None to indicate a missing value (e.g. sensor failure or scheduled gap).

required
statistic str

The statistical nature of this value. Must be one of: 'mean', 'median', 'mode', 'min', 'max', 'quantile', 'sample'.

None
quantile_level float

If statistic='quantile', this defines the specific level (e.g., 0.95 for the 95th percentile).

None
interval_lower float

The lower bound of the confidence interval or prediction interval.

None
interval_upper float

The upper bound of the confidence interval or prediction interval.

None
score float

A quality or confidence score for this specific point (0.0 to 1.0).

None
timestamp str | datetime

The specific time this prediction applies to. If a datetime object is provided, it will be automatically formatted as an ISO 8601 string.

None
attributes dict

Arbitrary metadata relevant to this specific point (e.g., {'is_anomaly': True}).

None

Returns:

Type Description
dict[str, Any]

dict[str, Any]: A dictionary representing a single regression point, ready

dict[str, Any]

to be included in the points array of an open/regression record.

Examples:

1. Basic Point

p = build_regression_point(value=42.0)
# {'value': 42.0}

2. Point with Confidence Interval

p = build_regression_point(
    value=105.5,
    interval_lower=100.0,
    interval_upper=110.0,
    statistic="mean"
)

3. Building a Time-Series List

data = [("2025-01-01", 50.0), ("2025-01-02", 55.5)]

points = [
    build_regression_point(value=price, timestamp=date)
    for date, price in data
]
Source code in venv/lib/python3.13/site-packages/dorsal/file/helpers.py
def build_regression_point(
    value: float | None,
    *,
    statistic: str | None = None,
    quantile_level: float | None = None,
    interval_lower: float | None = None,
    interval_upper: float | None = None,
    score: float | None = None,
    timestamp: str | datetime.datetime | None = None,
    attributes: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """
    Constructs a validated dictionary for a single regression data point.

    This helper is designed to be used when building complex datasets (like time-series or
    multi-point forecasts) where you need to generate a list of points before
    wrapping them in a full record.

    Args:
        value (float | None): The predicted or sampled value. Can be `None` to indicate
            a missing value (e.g. sensor failure or scheduled gap).
        statistic (str, optional): The statistical nature of this value.
            Must be one of: `'mean'`, `'median'`, `'mode'`, `'min'`, `'max'`,
            `'quantile'`, `'sample'`.
        quantile_level (float, optional): If `statistic='quantile'`, this defines
            the specific level (e.g., `0.95` for the 95th percentile).
        interval_lower (float, optional): The lower bound of the confidence interval
            or prediction interval.
        interval_upper (float, optional): The upper bound of the confidence interval
            or prediction interval.
        score (float, optional): A quality or confidence score for this specific
            point (0.0 to 1.0).
        timestamp (str | datetime, optional): The specific time this prediction applies to.
            If a `datetime` object is provided, it will be automatically formatted as
            an ISO 8601 string.
        attributes (dict, optional): Arbitrary metadata relevant to this specific point
            (e.g., `{'is_anomaly': True}`).

    Returns:
        dict[str, Any]: A dictionary representing a single regression point, ready
        to be included in the `points` array of an `open/regression` record.

    Examples:
        **1. Basic Point**

        ```python
        p = build_regression_point(value=42.0)
        # {'value': 42.0}
        ```

        **2. Point with Confidence Interval**

        ```python
        p = build_regression_point(
            value=105.5,
            interval_lower=100.0,
            interval_upper=110.0,
            statistic="mean"
        )
        ```

        **3. Building a Time-Series List**

        ```python
        data = [("2025-01-01", 50.0), ("2025-01-02", 55.5)]

        points = [
            build_regression_point(value=price, timestamp=date)
            for date, price in data
        ]
        ```
    """
    point: dict[str, Any] = {"value": value}

    if statistic:
        point["statistic"] = statistic
    if quantile_level is not None:
        point["quantile_level"] = quantile_level
    if interval_lower is not None:
        point["interval_lower"] = interval_lower
    if interval_upper is not None:
        point["interval_upper"] = interval_upper
    if score is not None:
        point["score"] = score
    if timestamp:
        point["timestamp"] = timestamp.isoformat() if hasattr(timestamp, "isoformat") else timestamp
    if attributes:
        point["attributes"] = attributes

    return point

build_regression_record

build_regression_record(
    points,
    *,
    target=None,
    unit=None,
    producer=None,
    score_explanation=None,
    attributes=None
)

Builds a full open/regression record from a list of point dictionaries.

Use this function when you have manually constructed a list of points (e.g. using build_regression_point in a loop) and want to wrap them in the standard record structure with global metadata.

Parameters:

Name Type Description Default
points list[dict]

A list of point dictionaries.

required
target str

The name of the variable being predicted (e.g., 'house_price', 'temperature', 'credit_score').

None
unit str

The unit of measurement (e.g., 'USD', 'celsius', 'kg').

None
producer str

The creator (model, tool, or author) of this regression data.

None
score_explanation str

A description of what the score field represents (e.g., "Model Confidence").

None
attributes dict

Arbitrary metadata relevant to the entire record.

None

Returns:

Type Description
dict[str, Any]

dict[str, Any]: A complete dictionary valid against the open/regression schema.

Examples:

Constructing a Time-Series Record

# 1. Create points
points = [
    build_regression_point(value=10, timestamp="2025-01-01"),
    build_regression_point(value=12, timestamp="2025-01-02")
]

# 2. Build record
record = build_regression_record(
    points=points,
    target="daily_active_users",
    producer="AnalyticsBot v1"
)
Source code in venv/lib/python3.13/site-packages/dorsal/file/helpers.py
def build_regression_record(
    points: list[dict[str, Any]],
    *,
    target: str | None = None,
    unit: str | None = None,
    producer: str | None = None,
    score_explanation: str | None = None,
    attributes: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """
    Builds a full `open/regression` record from a list of point dictionaries.

    Use this function when you have manually constructed a list of points (e.g.
    using `build_regression_point` in a loop) and want to wrap them in the
    standard record structure with global metadata.

    Args:
        points (list[dict]): A list of point dictionaries.
        target (str, optional): The name of the variable being predicted
            (e.g., 'house_price', 'temperature', 'credit_score').
        unit (str, optional): The unit of measurement (e.g., 'USD', 'celsius', 'kg').
        producer (str, optional): The creator (model, tool, or author) of this
            regression data.
        score_explanation (str, optional): A description of what the `score` field
            represents (e.g., "Model Confidence").
        attributes (dict, optional): Arbitrary metadata relevant to the entire record.

    Returns:
        dict[str, Any]: A complete dictionary valid against the `open/regression` schema.

    Examples:
        **Constructing a Time-Series Record**

        ```python
        # 1. Create points
        points = [
            build_regression_point(value=10, timestamp="2025-01-01"),
            build_regression_point(value=12, timestamp="2025-01-02")
        ]

        # 2. Build record
        record = build_regression_record(
            points=points,
            target="daily_active_users",
            producer="AnalyticsBot v1"
        )
        ```
    """
    record: dict[str, Any] = {"points": points}

    if target:
        record["target"] = target
    if unit:
        record["unit"] = unit
    if producer:
        record["producer"] = producer
    if score_explanation:
        record["score_explanation"] = score_explanation
    if attributes:
        record["attributes"] = attributes

    return record

build_single_point_regression_record

build_single_point_regression_record(
    value,
    *,
    target=None,
    unit=None,
    producer=None,
    score_explanation=None,
    statistic=None,
    quantile_level=None,
    interval_lower=None,
    interval_upper=None,
    score=None,
    timestamp=None,
    attributes=None
)

Convenience helper to build a full open/regression record containing exactly one point.

This function abstracts away the points array structure for the common use case of a single scalar prediction or measurement. It combines arguments for both the record (e.g. target) and the point (e.g. value).

Parameters:

Name Type Description Default
value float | None

The predicted or sampled value.

required
target str

The name of the variable being predicted.

None
unit str

The unit of measurement.

None
producer str

The creator of this data.

None
score_explanation str

Description of the score metric.

None
statistic str

The statistical nature of the value.

None
quantile_level float

Level for quantile statistics.

None
interval_lower float

Lower bound of confidence interval.

None
interval_upper float

Upper bound of confidence interval.

None
score float

Quality score for the point.

None
timestamp str | datetime

Time of the prediction.

None
attributes dict

Attributes for the point.

None

Returns:

Type Description
dict[str, Any]

dict[str, Any]: A complete dictionary valid against the open/regression schema,

dict[str, Any]

containing a single item in the points list.

Examples:

Simple Prediction

record = build_single_point_regression_record(
    target="credit_score",
    value=750,
    statistic="mean"
)
Source code in venv/lib/python3.13/site-packages/dorsal/file/helpers.py
def build_single_point_regression_record(
    value: float | None,
    *,
    target: str | None = None,
    unit: str | None = None,
    producer: str | None = None,
    score_explanation: str | None = None,
    statistic: str | None = None,
    quantile_level: float | None = None,
    interval_lower: float | None = None,
    interval_upper: float | None = None,
    score: float | None = None,
    timestamp: str | datetime.datetime | None = None,
    attributes: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """
    Convenience helper to build a full `open/regression` record containing exactly one point.

    This function abstracts away the `points` array structure for the common use case
    of a single scalar prediction or measurement. It combines arguments for both the
    record (e.g. `target`) and the point (e.g. `value`).

    Args:
        value (float | None): The predicted or sampled value.
        target (str, optional): The name of the variable being predicted.
        unit (str, optional): The unit of measurement.
        producer (str, optional): The creator of this data.
        score_explanation (str, optional): Description of the score metric.
        statistic (str, optional): The statistical nature of the value.
        quantile_level (float, optional): Level for quantile statistics.
        interval_lower (float, optional): Lower bound of confidence interval.
        interval_upper (float, optional): Upper bound of confidence interval.
        score (float, optional): Quality score for the point.
        timestamp (str | datetime, optional): Time of the prediction.
        attributes (dict, optional): Attributes for the **point**.

    Returns:
        dict[str, Any]: A complete dictionary valid against the `open/regression` schema,
        containing a single item in the `points` list.

    Examples:
        **Simple Prediction**

        ```python
        record = build_single_point_regression_record(
            target="credit_score",
            value=750,
            statistic="mean"
        )
        ```
    """

    point = build_regression_point(
        value=value,
        statistic=statistic,
        quantile_level=quantile_level,
        interval_lower=interval_lower,
        interval_upper=interval_upper,
        score=score,
        timestamp=timestamp,
        attributes=attributes,
    )

    return build_regression_record(
        points=[point],
        target=target,
        unit=unit,
        producer=producer,
        score_explanation=score_explanation,
    )

build_transcription_record

build_transcription_record(
    text, language=None, track_id=None
)

Builds a simple 'open/audio-transcription' record.

Parameters:

Name Type Description Default
text str

The full transcribed text.

required
language str | None

Optional 3-letter ISO-639-3 language code.

None
track_id str | int | None

Optional identifier for the audio track.

None

Returns:

Type Description
dict[str, Any]

A dictionary structured to match the 'open/audio-transcription' schema.

Source code in venv/lib/python3.13/site-packages/dorsal/file/helpers.py
def build_transcription_record(
    text: str,
    language: str | None = None,
    track_id: str | int | None = None,
) -> dict[str, Any]:
    """
    Builds a simple 'open/audio-transcription' record.

    Args:
        text: The full transcribed text.
        language: Optional 3-letter ISO-639-3 language code.
        track_id: Optional identifier for the audio track.

    Returns:
        A dictionary structured to match the 'open/audio-transcription' schema.
    """
    record_data: dict[str, Any] = {
        "text": text,
    }
    if language is not None:
        record_data["language"] = language
    if track_id is not None:
        record_data["track_id"] = track_id
    return record_data