Skip to content

open/document-extraction

Title: Document Extraction

Use Case: Text extracted from documents. Includes text, as well as structural information such as line numbers, bounding boxes and polygons.

URL: https://dorsalhub.com/schemas/open/document-extraction

This schema stores the content of a document (like a PDF or image) as a structured list of text blocks.

  • Extraction Type: A top-level extraction_type is required (e.g., "text", "lines", "mixed").
  • Blocks Array: All content is stored within an array named blocks.
  • Block Content: Each block object must have text content and a block_type (one of "text", "line", "box", or "polygon").
  • Location: Blocks can specify their location using page_number, line_number, and geometry (box or polygon). Geometry is required for "box" and "polygon" blocks, and "line" blocks must include one of the two.
  • Coordinates: If extraction_type is "boxes", "polygons", or "mixed", a top-level unit field is required (one of "px", "pt", "normalized", or "per_mille").
  • Additional Metadata: Each block supports a score and an attributes object for custom metadata.
{
    "extraction_type": "mixed",
    "unit": "pt",
    "blocks": [
        {
            "block_type": "text",
            "id": "c7a4b0c8-3b1a-4b6f-8c3b-2a7e1a3e4b5c",
            "text": "Annual Report Summary",
            "page_number": 1
        },
        {
            "block_type": "box",
            "id": "d8b5c1d9-4c2b-4c7f-9d4c-3b8f2b4f5c6d",
            "text": "The first column of text begins here, discussing the quarterly earnings and projections for the upcoming fiscal year.",
            "page_number": 1,
            "box": {
                "x": 50,
                "y": 120,
                "width": 250,
                "height": 400
            }
        },
        {
            "block_type": "box",
            "id": "e9c6d2e0-5d3c-4d8b-a0e5-4c9c3c5c6d7e",
            "text": "The second column contains shareholder information and a summary of the annual general meeting.",
            "page_number": 1,
            "box": {
                "x": 320,
                "y": 120,
                "width": 250,
                "height": 400
            }
        },
        {
            "block_type": "polygon",
            "id": "f0d7e3f1-6e4d-4e9a-b1f6-5d0b4d6b7e8f",
            "text": "This caption is wrapped around a circular diagram on the page.",
            "page_number": 2,
            "polygon": [
                {
                    "x": 100,
                    "y": 100
                },
                {
                    "x": 500,
                    "y": 100
                },
                {
                    "x": 500,
                    "y": 250
                },
                {
                    "x": 350,
                    "y": 250
                },
                {
                    "x": 350,
                    "y": 150
                },
                {
                    "x": 100,
                    "y": 150
                }
            ]
        }
    ]
}
{
    "_license": {
        "id": "Apache-2.0",
        "notice": "Copyright 2025 Dorsal Hub LTD",
        "url": "https://github.com/dorsalhub/open-validation-schemas/blob/main/LICENSE"
    },
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "$id": "https://dorsalhub.com/schemas/open/document-extraction",
    "title": "Document Extraction",
    "version": "0.1.0",
    "description": "Represent the layout and content of a document, including text blocks, geometric coordinates, and page structure.",
    "type": "object",
    "properties": {
        "extraction_type": {
            "type": "string",
            "description": "Indicates the extraction type.",
            "enum": [
                "text",
                "lines",
                "boxes",
                "polygons",
                "mixed"
            ]
        },
        "producer": {
            "type": "string",
            "description": "The creator (model, tool or author) of this extraction.",
            "maxLength": 1024
        },
        "unit": {
            "type": "string",
            "description": "The unit for all coordinate values if geometric data is present.",
            "enum": [
                "px",
                "pt",
                "normalized",
                "per_mille"
            ]
        },
        "score_explanation": {
            "type": "string",
            "description": "Defines the meaning of the 'score' field.",
            "maxLength": 256
        },
        "blocks": {
            "type": "array",
            "description": "An array of text blocks extracted from the document.",
            "maxItems": 100000,
            "items": {
                "type": "object",
                "properties": {
                    "block_type": {
                        "type": "string",
                        "description": "Defines the nature of this specific block (text, line, box, or polygon).",
                        "enum": [
                            "text",
                            "line",
                            "box",
                            "polygon"
                        ]
                    },
                    "id": {
                        "type": "string",
                        "description": "A unique identifier (e.g., UUID4) for this block, useful for referencing.",
                        "maxLength": 128
                    },
                    "text": {
                        "type": "string",
                        "description": "The text content of the block.",
                        "maxLength": 4096
                    },
                    "line_number": {
                        "type": "integer",
                        "description": "The line number of the block within its page, if applicable.",
                        "minimum": 1
                    },
                    "page_number": {
                        "type": "integer",
                        "description": "The page number (1-indexed) where this block is located.",
                        "minimum": 1
                    },
                    "score": {
                        "type": "number",
                        "description": "The confidence score for this block's detection and transcription, ranging from 0.0 (uncertain) to 1.0 (certain).",
                        "minimum": 0,
                        "maximum": 1
                    },
                    "box": {
                        "type": "object",
                        "description": "A rectangular bounding box defined by its top-left corner (x,y) and its dimensions.",
                        "properties": {
                            "x": {
                                "type": "number",
                                "minimum": 0,
                                "description": "The x-coordinate of the top-left corner."
                            },
                            "y": {
                                "type": "number",
                                "minimum": 0,
                                "description": "The y-coordinate of the top-left corner."
                            },
                            "width": {
                                "type": "number",
                                "minimum": 0,
                                "description": "The width of the box."
                            },
                            "height": {
                                "type": "number",
                                "minimum": 0,
                                "description": "The height of the box."
                            }
                        },
                        "required": [
                            "x",
                            "y",
                            "width",
                            "height"
                        ],
                        "additionalProperties": false
                    },
                    "polygon": {
                        "type": "array",
                        "description": "An array of coordinate points defining the block's boundary for non-rectangular shapes.",
                        "maxItems": 100,
                        "minItems": 3,
                        "items": {
                            "type": "object",
                            "properties": {
                                "x": {
                                    "type": "number",
                                    "minimum": 0,
                                    "description": "The x-coordinate of a vertex point."
                                },
                                "y": {
                                    "type": "number",
                                    "minimum": 0,
                                    "description": "The y-coordinate of a vertex point."
                                }
                            },
                            "required": [
                                "x",
                                "y"
                            ],
                            "additionalProperties": false
                        }
                    },
                    "attributes": {
                        "type": "object",
                        "description": "Arbitrary metadata relevant to this block.",
                        "maxProperties": 16,
                        "additionalProperties": {
                            "anyOf": [
                                {
                                    "type": "string",
                                    "maxLength": 1024
                                },
                                {
                                    "type": "number"
                                },
                                {
                                    "type": "boolean"
                                },
                                {
                                    "type": "null"
                                }
                            ]
                        }
                    }
                },
                "required": [
                    "text",
                    "block_type"
                ],
                "additionalProperties": false,
                "allOf": [
                    {
                        "if": {
                            "properties": {
                                "block_type": {
                                    "const": "box"
                                }
                            }
                        },
                        "then": {
                            "required": [
                                "box"
                            ]
                        }
                    },
                    {
                        "if": {
                            "properties": {
                                "block_type": {
                                    "const": "polygon"
                                }
                            }
                        },
                        "then": {
                            "required": [
                                "polygon"
                            ]
                        }
                    },
                    {
                        "if": {
                            "properties": {
                                "block_type": {
                                    "const": "line"
                                }
                            }
                        },
                        "then": {
                            "anyOf": [
                                {
                                    "required": [
                                        "box"
                                    ]
                                },
                                {
                                    "required": [
                                        "polygon"
                                    ]
                                }
                            ]
                        }
                    }
                ]
            }
        },
        "attributes": {
            "type": "object",
            "description": "Arbitrary metadata relevant to this extraction.",
            "maxProperties": 16,
            "additionalProperties": {
                "anyOf": [
                    {
                        "type": "string",
                        "maxLength": 1024
                    },
                    {
                        "type": "number"
                    },
                    {
                        "type": "boolean"
                    },
                    {
                        "type": "null"
                    }
                ]
            }
        }
    },
    "required": [
        "extraction_type",
        "blocks"
    ],
    "additionalProperties": false,
    "oneOf": [
        {
            "properties": {
                "extraction_type": {
                    "enum": [
                        "text",
                        "lines"
                    ]
                }
            }
        },
        {
            "properties": {
                "extraction_type": {
                    "enum": [
                        "boxes",
                        "polygons",
                        "mixed"
                    ]
                }
            },
            "required": [
                "unit"
            ]
        }
    ],
    "allOf": [
        {
            "$comment": "Caps coordinates at 1 only when 'unit' is present and equals 'normalized'. 'required' prevents the condition from matching vacuously when 'unit' is absent.",
            "if": {
                "properties": {
                    "unit": {
                        "const": "normalized"
                    }
                },
                "required": [
                    "unit"
                ]
            },
            "then": {
                "properties": {
                    "blocks": {
                        "items": {
                            "properties": {
                                "box": {
                                    "properties": {
                                        "x": {
                                            "maximum": 1
                                        },
                                        "y": {
                                            "maximum": 1
                                        },
                                        "width": {
                                            "maximum": 1
                                        },
                                        "height": {
                                            "maximum": 1
                                        }
                                    }
                                },
                                "polygon": {
                                    "items": {
                                        "properties": {
                                            "x": {
                                                "maximum": 1
                                            },
                                            "y": {
                                                "maximum": 1
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        },
        {
            "$comment": "Caps coordinates at 1000 only when 'unit' is present and equals 'per_mille'. 'required' prevents the condition from matching vacuously when 'unit' is absent.",
            "if": {
                "properties": {
                    "unit": {
                        "const": "per_mille"
                    }
                },
                "required": [
                    "unit"
                ]
            },
            "then": {
                "properties": {
                    "blocks": {
                        "items": {
                            "properties": {
                                "box": {
                                    "properties": {
                                        "x": {
                                            "maximum": 1000
                                        },
                                        "y": {
                                            "maximum": 1000
                                        },
                                        "width": {
                                            "maximum": 1000
                                        },
                                        "height": {
                                            "maximum": 1000
                                        }
                                    }
                                },
                                "polygon": {
                                    "items": {
                                        "properties": {
                                            "x": {
                                                "maximum": 1000
                                            },
                                            "y": {
                                                "maximum": 1000
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    ]
}