Skip to content

open/entity-extraction

Title: Entity Extraction

Use Case: A schema for extracted entities, covering both standard NER (text/label) and domain-specific slot-filling (concept/definition).

URL: https://dorsalhub.com/schemas/open/entity-extraction

This schema stores structured data about entities identified within a file, supporting tasks that range from simple named entity recognition (NER) to complex slot-filling.

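  • Entities Array: The core of the schema is the entities array, which holds the list of extracted items. It is the only required top-level property; if it is empty, a vocabulary or vocabulary_url must be supplied.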
  • Vocabulary: Define allowed labels using the vocabulary list or a vocabulary_url.
  • Entity Details: Each entity requires a label (e.g., 'DATE') and can optionally include the raw text, a unique id, and a confidence score.
  • Business Logic: Supports a concept field for business roles (e.g., 'Plaintiff'), a definition field, and a value field for machine-readable data (string, number, boolean, or null).
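  • Location: Entities can be located geometrically (box or polygon on a given page_number) or logically by referencing a specific document block via block_ref. Geometric coordinates are expressed in the top-level unit (px, pt, normalized, or per_mille).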
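  • Metadata: Arbitrary metadata can be stored as flat key-value pairs (string, number, boolean, or null values) in the attributes object, available both at the top level and on each entity.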
{
  "vocabulary_url": "https://example.com/tax-taxonomy/v1",
  "vocabulary": ["ORG", "DATE", "MONEY", "PERSON"],
  "unit": "normalized",
  "entities": [
    {
      "id": "ent_001",
      "text": "Acme Corp",
      "label": "ORG",
      "score": 0.98,
      "location": [
        {
          "type": "block",
          "block_type": "box",
          "page_number": 1,
          "box": {
            "x": 0.1,
            "y": 0.1,
            "width": 0.3,
            "height": 0.05
          }
        }
      ]
    },
    {
      "id": "ent_002",
      "text": "$1,250.00",
      "label": "MONEY",
      "concept": "TotalAmount",
      "definition": "The final sum payable after taxes.",
      "value": 1250.00,
      "score": 0.99,
      "attributes": {
        "currency": "USD",
        "is_estimated": false
      },
      "location": [
        {
          "type": "block_ref",
          "block_id": "block_8f9e2d1c-4b5a"
        }
      ]
    },
    {
      "id": "ent_003",
      "text": "Jane Doe",
      "label": "PERSON",
      "concept": "Recipient",
      "value": "DOE, JANE",
      "location": [
        {
          "type": "block",
          "block_type": "polygon",
          "page_number": 1,
          "polygon": [
            { "x": 0.1, "y": 0.2 },
            { "x": 0.15, "y": 0.2 },
            { "x": 0.15, "y": 0.22 },
            { "x": 0.1, "y": 0.22 }
          ]
        }
      ]
    }
  ]
}
{
    "_license": {
        "id": "Apache-2.0",
        "notice": "Copyright 2025 Dorsal Hub LTD",
        "url": "https://github.com/dorsalhub/open-validation-schemas/blob/main/LICENSE"
    },
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "$id": "https://dorsalhub.com/schemas/open/entity-extraction",
    "title": "Entity Extraction",
    "version": "0.1.0",
    "description": "Represent named entities, structural slots, or visual concepts extracted from unstructured data. Links raw evidence (text spans or geometric regions) to normalized values and business concepts.",
    "type": "object",
    "properties": {
        "unit": {
            "type": "string",
            "description": "The unit for all coordinate values if geometric data is present.",
            "enum": [
                "px",
                "pt",
                "normalized",
                "per_mille"
            ]
        },
        "producer": {
            "type": "string",
            "description": "The creator (model, tool or author) of this extraction.",
            "maxLength": 1024
        },
        "vocabulary_url": {
            "type": "string",
            "format": "uri",
            "description": "URL to the definition of the entity labels.",
            "maxLength": 2048
        },
        "vocabulary": {
            "type": "array",
            "description": "Allowed values for the 'label' field.",
            "maxItems": 100,
            "items": {
                "type": "string",
                "maxLength": 128
            }
        },
        "score_explanation": {
            "type": "string",
            "description": "Defines the meaning of the 'score' field.",
            "maxLength": 256
        },
        "entities": {
            "type": "array",
            "description": "An array of extracted entities.",
            "maxItems": 100000,
            "items": {
                "type": "object",
                "properties": {
                    "id": {
                        "type": "string",
                        "description": "A unique identifier for this entity instance (e.g. UUID).",
                        "maxLength": 128
                    },
                    "concept": {
                        "type": "string",
                        "description": "The business role or slot this entity fills (e.g. 'InvoiceDate', 'Plaintiff'). Optional.",
                        "maxLength": 128
                    },
                    "label": {
                        "type": "string",
                        "description": "The entity category (e.g. 'PER', 'DATE', 'MONEY').",
                        "maxLength": 128
                    },
                    "text": {
                        "type": "string",
                        "description": "The raw text span as it appears in the source. Optional for visual entities (e.g. logos, signatures).",
                        "maxLength": 4096
                    },
                    "value": {
                        "description": "The machine-readable value (e.g. '2025-11-21', 3500.00, true).",
                        "anyOf": [
                            {
                                "type": "string",
                                "maxLength": 4096
                            },
                            {
                                "type": "number"
                            },
                            {
                                "type": "boolean"
                            },
                            {
                                "type": "null"
                            }
                        ]
                    },
                    "definition": {
                        "type": "string",
                        "description": "Definition of the concept.",
                        "maxLength": 1024
                    },
                    "score": {
                        "type": "number",
                        "description": "The confidence score for this extraction, ranging from 0.0 (uncertain) to 1.0 (certain).",
                        "minimum": 0,
                        "maximum": 1
                    },
                    "location": {
                        "type": "array",
                        "description": "The physical location(s) of the entity.",
                        "maxItems": 10,
                        "items": {
                            "oneOf": [
                                {
                                    "title": "Block Reference",
                                    "type": "object",
                                    "properties": {
                                        "type": {
                                            "const": "block_ref"
                                        },
                                        "block_id": {
                                            "type": "string",
                                            "maxLength": 128
                                        }
                                    },
                                    "required": [
                                        "type",
                                        "block_id"
                                    ],
                                    "additionalProperties": false
                                },
                                {
                                    "title": "Geometric Location",
                                    "type": "object",
                                    "properties": {
                                        "type": {
                                            "const": "block"
                                        },
                                        "block_type": {
                                            "enum": [
                                                "box",
                                                "polygon"
                                            ]
                                        },
                                        "page_number": {
                                            "type": "integer",
                                            "minimum": 1
                                        },
                                        "box": {
                                            "type": "object",
                                            "properties": {
                                                "x": {
                                                    "type": "number"
                                                },
                                                "y": {
                                                    "type": "number"
                                                },
                                                "width": {
                                                    "type": "number"
                                                },
                                                "height": {
                                                    "type": "number"
                                                }
                                            },
                                            "required": [
                                                "x",
                                                "y",
                                                "width",
                                                "height"
                                            ],
                                            "additionalProperties": false
                                        },
                                        "polygon": {
                                            "type": "array",
                                            "minItems": 3,
                                            "maxItems": 100,
                                            "items": {
                                                "type": "object",
                                                "properties": {
                                                    "x": {
                                                        "type": "number"
                                                    },
                                                    "y": {
                                                        "type": "number"
                                                    }
                                                },
                                                "required": [
                                                    "x",
                                                    "y"
                                                ]
                                            }
                                        }
                                    },
                                    "required": [
                                        "type",
                                        "block_type",
                                        "page_number"
                                    ],
                                    "additionalProperties": false,
                                    "allOf": [
                                        {
                                            "if": {
                                                "properties": {
                                                    "block_type": {
                                                        "const": "box"
                                                    }
                                                }
                                            },
                                            "then": {
                                                "required": [
                                                    "box"
                                                ]
                                            }
                                        },
                                        {
                                            "if": {
                                                "properties": {
                                                    "block_type": {
                                                        "const": "polygon"
                                                    }
                                                }
                                            },
                                            "then": {
                                                "required": [
                                                    "polygon"
                                                ]
                                            }
                                        }
                                    ]
                                }
                            ]
                        }
                    },
                    "attributes": {
                        "type": "object",
                        "description": "Arbitrary metadata (e.g. currency, gender). Flat key-value pairs.",
                        "maxProperties": 16,
                        "additionalProperties": {
                            "anyOf": [
                                {
                                    "type": "string",
                                    "maxLength": 1024
                                },
                                {
                                    "type": "number"
                                },
                                {
                                    "type": "boolean"
                                },
                                {
                                    "type": "null"
                                }
                            ]
                        }
                    }
                },
                "required": [
                    "label"
                ],
                "additionalProperties": false
            }
        },
        "attributes": {
            "type": "object",
            "description": "Arbitrary metadata relevant to this extraction.",
            "maxProperties": 16,
            "additionalProperties": {
                "anyOf": [
                    {
                        "type": "string",
                        "maxLength": 1024
                    },
                    {
                        "type": "number"
                    },
                    {
                        "type": "boolean"
                    },
                    {
                        "type": "null"
                    }
                ]
            }
        }
    },
    "required": [
        "entities"
    ],
    "additionalProperties": false,
    "allOf": [
        {
            "$comment": "When 'entities' is empty, at least one of 'vocabulary' or 'vocabulary_url' must be present. 'entities' is required at the top level, so this 'if' cannot match vacuously on a valid document.",
            "if": {
                "properties": {
                    "entities": {
                        "maxItems": 0
                    }
                }
            },
            "then": {
                "anyOf": [
                    {
                        "required": [
                            "vocabulary"
                        ]
                    },
                    {
                        "required": [
                            "vocabulary_url"
                        ]
                    }
                ]
            }
        },
        {
            "$comment": "Applies only when 'unit' is present and equals 'normalized'. 'required' keeps the 'if' from matching vacuously when 'unit' is omitted. Caps every box and polygon coordinate at 1.",
            "if": {
                "properties": {
                    "unit": {
                        "const": "normalized"
                    }
                },
                "required": [
                    "unit"
                ]
            },
            "then": {
                "properties": {
                    "entities": {
                        "items": {
                            "properties": {
                                "location": {
                                    "items": {
                                        "properties": {
                                            "box": {
                                                "properties": {
                                                    "x": {
                                                        "maximum": 1
                                                    },
                                                    "y": {
                                                        "maximum": 1
                                                    },
                                                    "width": {
                                                        "maximum": 1
                                                    },
                                                    "height": {
                                                        "maximum": 1
                                                    }
                                                }
                                            },
                                            "polygon": {
                                                "items": {
                                                    "properties": {
                                                        "x": {
                                                            "maximum": 1
                                                        },
                                                        "y": {
                                                            "maximum": 1
                                                        }
                                                    }
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        },
        {
            "$comment": "Applies only when 'unit' is present and equals 'per_mille'. 'required' keeps the 'if' from matching vacuously when 'unit' is omitted. Caps every box and polygon coordinate at 1000.",
            "if": {
                "properties": {
                    "unit": {
                        "const": "per_mille"
                    }
                },
                "required": [
                    "unit"
                ]
            },
            "then": {
                "properties": {
                    "entities": {
                        "items": {
                            "properties": {
                                "location": {
                                    "items": {
                                        "properties": {
                                            "box": {
                                                "properties": {
                                                    "x": {
                                                        "maximum": 1000
                                                    },
                                                    "y": {
                                                        "maximum": 1000
                                                    },
                                                    "width": {
                                                        "maximum": 1000
                                                    },
                                                    "height": {
                                                        "maximum": 1000
                                                    }
                                                }
                                            },
                                            "polygon": {
                                                "items": {
                                                    "properties": {
                                                        "x": {
                                                            "maximum": 1000
                                                        },
                                                        "y": {
                                                            "maximum": 1000
                                                        }
                                                    }
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    ]
}