# Reducto Split API
#
# Automatically separate multi-document files and long forms into individual
# logical units using rules-based Split or Deep Split, then route each unit to
# downstream Parse, Extract, or Edit operations inside a Pipeline.
#
# OpenAPI Specification
#
# File: reducto-split-api-openapi.yml
# OpenAPI document version. The schemas in this file use `type: 'null'` members inside `anyOf`.
openapi: 3.1.0
# API identity, contact and licensing metadata.
info:
  title: Reducto Split API
  version: 1.0.0
  description: Automatically separate multi-document files and long forms into individual units using rules-based or deep
    split.
  contact:
    name: Reducto Support
    # NOTE(review): `[email protected]` looks like an email-obfuscation placeholder left behind by a web scrape,
    # not a real address. Because it is unquoted and starts with `[`, YAML reads it as a flow sequence
    # rather than a string. TODO: restore the real support address as a quoted string.
    email: [email protected]
    url: https://reducto.ai/contact
  license:
    name: Reducto Terms of Service
    url: https://reducto.ai/terms
# Server the paths in this document are served from.
servers:
- url: https://platform.reducto.ai
  description: Reducto production platform
# Document-wide security requirement. The SkippableHTTPBearer scheme definition is not visible in this part of
# the file (presumably under components.securitySchemes; verify).
security:
- SkippableHTTPBearer: []
# Tag referenced by the operations under `paths`.
tags:
- name: Split
paths:
  # POST /split: takes a SyncSplitConfig JSON body and answers with a
  # SplitResponse (200) or an HTTPValidationError (422).
  /split:
    post:
      tags:
      - Split
      operationId: split_split_post
      summary: Split
      # Optional `user-id` request header; the value may be a string or null.
      parameters:
      - in: header
        name: user-id
        required: false
        schema:
          title: User-Id
          anyOf:
          - type: string
          - type: 'null'
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/SyncSplitConfig'
        required: true
      responses:
        '200':
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/SplitResponse'
          description: Successful Response
        '422':
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/HTTPValidationError'
          description: Validation Error
      security:
      - SkippableHTTPBearer: []
  # POST /split_async: takes a config__v3__AsyncSplitConfig JSON body and
  # answers with an AsyncSplitResponse (200) or an HTTPValidationError (422).
  /split_async:
    post:
      tags:
      - Split
      operationId: split_async_split_async_post
      summary: Split Async
      # Optional `user-id` request header; the value may be a string or null.
      parameters:
      - in: header
        name: user-id
        required: false
        schema:
          title: User-Id
          anyOf:
          - type: string
          - type: 'null'
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/config__v3__AsyncSplitConfig'
        required: true
      responses:
        '200':
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/AsyncSplitResponse'
          description: Successful Response
        '422':
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/HTTPValidationError'
          description: Validation Error
      security:
      - SkippableHTTPBearer: []
components:
  schemas:
    # Settings: per-request processing options. No property is required; each
    # one either declares a default or accepts null.
    Settings:
      title: Settings
      type: object
      properties:
        ocr_system:
          title: Ocr System
          description: >-
            Standard is our best multilingual OCR system. Legacy only supports germanic
            languages and is available for backwards compatibility.
          type: string
          enum:
          - standard
          - legacy
          default: standard
        extraction_mode:
          title: Extraction Mode
          description: >-
            The mode to use for text extraction from PDFs. OCR mode uses optical character
            recognition only. Hybrid mode combines OCR with embedded PDF text for best
            accuracy (default).
          type: string
          enum:
          - ocr
          - hybrid
          default: hybrid
        force_url_result:
          title: Force Url Result
          description: Force the result to be returned in URL form.
          type: boolean
          default: false
        force_file_extension:
          title: Force File Extension
          description: Force the URL to be downloaded as a specific file extension (e.g. `.png`).
          anyOf:
          - type: string
          - type: 'null'
        return_ocr_data:
          title: Return Ocr Data
          description: If True, return OCR data in the result. Defaults to False.
          type: boolean
          default: false
        return_images:
          title: Return Images
          description: >-
            Whether to return images for the specified block types. 'page' returns full
            page images. By default, no images are returned.
          type: array
          items:
            type: string
            enum:
            - figure
            - table
            - page
          default: []
        embed_pdf_metadata:
          title: Embed Pdf Metadata
          description: If True, embed OCR metadata into the returned PDF. Defaults to False.
          type: boolean
          default: false
        embed_pdf_metadata_dpi:
          title: Embed Pdf Metadata Dpi
          description: >-
            Render DPI used when rasterizing the source PDF before embedding the OCR text
            layer (only applies when ``embed_pdf_metadata`` is True). Lower values produce
            dramatically smaller output PDFs; higher values preserve more detail when
            zoomed past 200%. Defaults to 100 (good for on-screen viewing); raise toward
            the source scan DPI for crisper output. Min 50, max 250.
          type: integer
          minimum: 50
          maximum: 250
          default: 100
        persist_results:
          title: Persist Results
          description: If True, persist the results indefinitely. Defaults to False.
          type: boolean
          default: false
        timeout:
          title: Timeout
          description: The timeout for the job in seconds.
          anyOf:
          - type: number
          - type: 'null'
        page_range:
          title: Page Range
          description: >-
            The page range to process (1-indexed). By default, the entire document is
            processed. For spreadsheets, you can also provide a list of sheet names.
          # Accepted shapes: one PageRange, a list of PageRange, a list of
          # integers, a list of strings, or null.
          anyOf:
          - $ref: '#/components/schemas/PageRange'
          - type: array
            items:
              $ref: '#/components/schemas/PageRange'
          - type: array
            items:
              type: integer
          - type: array
            items:
              type: string
          - type: 'null'
        document_password:
          title: Document Password
          description: Password to decrypt password-protected documents.
          anyOf:
          - type: string
          - type: 'null'
    # ValidationError: one validation failure entry. `loc`, `msg` and `type`
    # are mandatory; `input` (untyped) and `ctx` (object) are optional.
    ValidationError:
      title: ValidationError
      type: object
      required:
      - loc
      - msg
      - type
      properties:
        loc:
          title: Location
          type: array
          items:
            anyOf:
            - type: string
            - type: integer
        msg:
          title: Message
          type: string
        type:
          title: Error Type
          type: string
        input:
          title: Input
        ctx:
          title: Context
          type: object
    # Enhance: optional enhancement switches. `agentic` is a list whose items
    # are TableAgentic, FigureAgentic or TextAgentic objects.
    Enhance:
      title: Enhance
      type: object
      properties:
        agentic:
          title: Agentic
          description: >-
            Agentic uses vision language models to enhance the accuracy of the output of
            different types of extraction. This will incur a cost and latency increase.
          type: array
          items:
            anyOf:
            - $ref: '#/components/schemas/TableAgentic'
            - $ref: '#/components/schemas/FigureAgentic'
            - $ref: '#/components/schemas/TextAgentic'
          default: []
        summarize_figures:
          title: Summarize Figures
          description: If True, summarize figures using a small vision language model. Defaults to True.
          type: boolean
          default: true
        intelligent_ordering:
          title: Intelligent Ordering
          description: >-
            If True, use an advanced vision language model to improve reading order
            accuracy, with a small increase in latency. Defaults to False.
          type: boolean
          default: false
    # SyncSplitConfig: request body of POST /split. `input` and
    # `split_description` are mandatory; the other properties carry defaults.
    SyncSplitConfig:
      title: SyncSplitConfig
      type: object
      required:
      - input
      - split_description
      properties:
        input:
          title: Input
          description: "For parse/split/extract pipelines, the URL of the document to be processed. You can provide one of\
            \ the following:\n            1. A publicly available URL\n            2. A presigned S3 URL\n            3. A\
            \ reducto:// prefixed URL obtained from the /upload endpoint after directly uploading a document\n           \
            \ 4. A jobid:// prefixed URL obtained from a previous /parse invocation\n            5. A list of URLs (for multi-document\
            \ pipelines, V3 API only)\n\n            For edit pipelines, this should be a string containing the edit instructions "
          anyOf:
          - type: string
          - type: array
            items:
              type: string
          - $ref: '#/components/schemas/UploadResponse'
        parsing:
          description: >-
            The configuration options for parsing the document. If you are passing in a
            jobid:// URL for the file, then this configuration will be ignored.
          $ref: '#/components/schemas/ParseOptions'
          default:
            enhance:
              agentic: []
              summarize_figures: true
              intelligent_ordering: false
            retrieval:
              chunking:
                chunk_mode: disabled
                chunk_overlap: 0
              filter_blocks: []
              embedding_optimized: false
            formatting:
              add_page_markers: false
              table_output_format: dynamic
              merge_tables: false
              include: []
            spreadsheet:
              split_large_tables:
                enabled: true
                size: 50
              include: []
              clustering: accurate
              exclude: []
            settings:
              ocr_system: standard
              extraction_mode: hybrid
              force_url_result: false
              return_ocr_data: false
              return_images: []
              embed_pdf_metadata: false
              embed_pdf_metadata_dpi: 100
              persist_results: false
        # Each entry is a SplitCategory object.
        split_description:
          title: Split Description
          description: The configuration options for processing the document.
          type: array
          items:
            $ref: '#/components/schemas/SplitCategory'
        split_rules:
          title: Split Rules
          description: The prompt that describes rules for splitting the document.
          type: string
          default: >-
            Split the document into the applicable sections. Sections may only overlap at
            their first and last page if at all.
        settings:
          description: The settings for split processing.
          $ref: '#/components/schemas/SplitSettings'
          default:
            table_cutoff: truncate
            allow_page_overlap: true
            deep_split: false
    # Split: one named split with its page numbers, a high/low `conf` value
    # (default low) and an optional, nullable list of SplitPartition entries.
    Split:
      title: Split
      type: object
      required:
      - name
      - pages
      properties:
        name:
          title: Name
          type: string
        pages:
          title: Pages
          type: array
          items:
            type: integer
        conf:
          title: Conf
          type: string
          enum:
          - high
          - low
          default: low
        partitions:
          title: Partitions
          anyOf:
          - type: array
            items:
              $ref: '#/components/schemas/SplitPartition'
          - type: 'null'
    # ParseUsage: usage figures referenced by SplitResponse.usage. Only
    # `num_pages` is required; the credit and breakdown fields are nullable.
    ParseUsage:
      title: ParseUsage
      type: object
      required:
      - num_pages
      properties:
        num_pages:
          title: Num Pages
          type: integer
        credits:
          title: Credits
          anyOf:
          - type: number
          - type: 'null'
        credit_breakdown:
          title: Credit Breakdown
          anyOf:
          # Object whose keys are limited to the names below and whose values
          # are numbers.
          - type: object
            propertyNames:
              enum:
              - page
              - html_page
              - docx_native_page
              - chart_agent
              - spreadsheet_cells
              - billable_spreadsheet_pages
              - agentic
              - complex
              - enrich_table
              - figure_summary
              - table_summary
              - key_value
              - agentic_text
              - promptable_agentic_text
            additionalProperties:
              type: number
          - type: 'null'
        page_billing_breakdown:
          title: Page Billing Breakdown
          description: >-
            Per-page breakdown of features used. Maps 1-indexed page numbers (as strings)
            to the list of billing features applied on that page (e.g. 'page', 'complex',
            'chart_agent').
          anyOf:
          - type: object
            additionalProperties:
              type: array
              items:
                type: string
                enum:
                - page
                - html_page
                - docx_native_page
                - agentic
                - complex
                - chart_agent
                - spreadsheet_cells
                - billable_spreadsheet_pages
                - enrich_table
                - figure_summary
                - table_summary
                - key_value
                - agentic_text
                - promptable_agentic_text
          - type: 'null'
    # config__v3__AsyncConfig (titled "AsyncConfig"): options referenced by the
    # `async` property of config__v3__AsyncSplitConfig. Nothing is required.
    config__v3__AsyncConfig:
      title: AsyncConfig
      type: object
      properties:
        metadata:
          title: Metadata
          description: JSON metadata included in webhook request body. Defaults to None.
        priority:
          title: Priority
          description: >-
            If True, attempts to process the job with priority if the user has priority
            processing budget available; by default, sync jobs are prioritized above async
            jobs.
          type: boolean
          default: false
        webhook:
          title: Webhook
          description: The webhook configuration for the asynchronous processing.
          anyOf:
          - $ref: '#/components/schemas/SvixWebhookConfig'
          - $ref: '#/components/schemas/DirectWebhookConfig'
          - type: 'null'
    # ParseOptions: groups the five parsing option objects (enhance, retrieval,
    # formatting, spreadsheet, settings), each with an explicit default value.
    ParseOptions:
      title: ParseOptions
      type: object
      properties:
        enhance:
          default:
            agentic: []
            summarize_figures: true
            intelligent_ordering: false
          $ref: '#/components/schemas/Enhance'
        retrieval:
          default:
            chunking:
              chunk_mode: disabled
              chunk_overlap: 0
            filter_blocks: []
            embedding_optimized: false
          $ref: '#/components/schemas/Retrieval'
        formatting:
          default:
            add_page_markers: false
            table_output_format: dynamic
            merge_tables: false
            include: []
          $ref: '#/components/schemas/Formatting'
        spreadsheet:
          default:
            split_large_tables:
              enabled: true
              size: 50
            include: []
            clustering: accurate
            exclude: []
          $ref: '#/components/schemas/Spreadsheet'
        settings:
          default:
            ocr_system: standard
            extraction_mode: hybrid
            force_url_result: false
            return_ocr_data: false
            return_images: []
            embed_pdf_metadata: false
            embed_pdf_metadata_dpi: 100
            persist_results: false
          $ref: '#/components/schemas/Settings'
    # Spreadsheet: spreadsheet-specific options (table splitting, extra cell
    # information to include, clustering mode, content to exclude).
    Spreadsheet:
      title: Spreadsheet
      type: object
      properties:
        split_large_tables:
          default:
            enabled: true
            size: 50
          $ref: '#/components/schemas/SplitLargeTables'
        include:
          title: Include
          description: Whether to include cell color, formula, and dropdown information in the output.
          type: array
          items:
            type: string
            enum:
            - cell_colors
            - formula
            - dropdowns
          default: []
        clustering:
          title: Clustering
          description: "In a spreadsheet with different tables inside, we enable splitting up the tables by default. Accurate\
            \ mode applies more powerful models for superior accuracy, at 5\xD7 the default per-cell rate. Disabling will\
            \ register as one large table."
          type: string
          enum:
          - accurate
          - fast
          - disabled
          default: accurate
        exclude:
          title: Exclude
          description: Whether to exclude hidden sheets, rows, or columns in the output.
          type: array
          # NOTE(review): the enum also accepts `styling` and
          # `spreadsheet_images`, which the description does not mention.
          items:
            type: string
            enum:
            - hidden_sheets
            - hidden_rows
            - hidden_cols
            - styling
            - spreadsheet_images
          default: []
    # FigureAgentic: agentic entry whose `scope` is fixed to "figure"; `scope`
    # is the only required property.
    FigureAgentic:
      title: FigureAgentic
      type: object
      required:
      - scope
      properties:
        scope:
          title: Scope
          type: string
          const: figure
        prompt:
          title: Prompt
          description: Custom prompt for figure agentic.
          anyOf:
          - type: string
          - type: 'null'
        advanced_chart_agent:
          title: Advanced Chart Agent
          description: If True, use the advanced chart agent. Defaults to False.
          type: boolean
          default: false
        return_overlays:
          title: Return Overlays
          description: >-
            If True, return overlays for the figure. This is so you can use the overlays to
            double check the quality of the extraction
          type: boolean
          default: false
    # SvixWebhookConfig: webhook variant whose `mode` is fixed to "svix", with
    # an optional list of channel names.
    SvixWebhookConfig:
      title: SvixWebhookConfig
      type: object
      properties:
        mode:
          title: Mode
          type: string
          const: svix
          default: svix
        channels:
          title: Channels
          description: A list of Svix channels the message will be delivered down, omit to send to all channels.
          type: array
          items:
            type: string
    # SplitSettings: three optional switches for split processing, each with a
    # declared default.
    SplitSettings:
      title: SplitSettings
      type: object
      properties:
        table_cutoff:
          title: Table Cutoff
          description: >-
            If tables should be truncated to the first few rows or if all content should be
            preserved. truncate improves latency, preserve is recommended for cases where
            partition_key is being used and the partition_key may be included within the
            table. Defaults to truncate
          type: string
          enum:
          - truncate
          - preserve
          default: truncate
        allow_page_overlap:
          title: Allow Page Overlap
          description: >-
            If True, a page can belong to multiple categories/partitions. If False, each
            page must belong to exactly one category. Defaults to True.
          type: boolean
          default: true
        deep_split:
          title: Deep Split
          description: If True, uses the deep split agent for higher-quality document splitting. Off by default.
          type: boolean
          default: false
    # Chunking: chunk mode, optional chunk size and chunk overlap. Nothing is
    # required.
    Chunking:
      title: Chunking
      type: object
      properties:
        chunk_mode:
          title: Chunk Mode
          description: >-
            Choose how to partition chunks. Variable mode chunks by character length and
            visual context. Section mode chunks by section headers. Page mode chunks
            according to pages. Page sections mode chunks first by page, then by sections
            within each page. Disabled returns one single chunk.
          type: string
          # NOTE(review): `block` is an accepted value but the description
          # above does not explain it.
          enum:
          - variable
          - section
          - page
          - disabled
          - block
          - page_sections
          default: disabled
        chunk_size:
          title: Chunk Size
          description: >-
            The approximate size of chunks (in characters) that the document will be split
            into. Defaults to null, in which case the chunk size is variable between 250 -
            1500 characters.
          anyOf:
          - type: integer
          - type: 'null'
        chunk_overlap:
          title: Chunk Overlap
          description: Number of characters of overlap to include from adjacent chunks. Defaults to 0.
          type: integer
          default: 0
    # SplitResult: one of the two shapes allowed for SplitResponse.result. Both
    # properties are required; `section_mapping` may be null, otherwise it is
    # an object whose values are integer arrays.
    SplitResult:
      title: SplitResult
      type: object
      required:
      - section_mapping
      - splits
      properties:
        section_mapping:
          title: Section Mapping
          anyOf:
          - type: object
            additionalProperties:
              type: array
              items:
                type: integer
          - type: 'null'
        splits:
          title: Splits
          type: array
          items:
            $ref: '#/components/schemas/Split'
    # SplitPartition: item type of Split.partitions, holding a name, integer
    # page numbers and a high/low `conf` value (default low).
    SplitPartition:
      title: SplitPartition
      type: object
      required:
      - name
      - pages
      properties:
        name:
          title: Name
          type: string
        pages:
          title: Pages
          type: array
          items:
            type: integer
        conf:
          title: Conf
          type: string
          enum:
          - high
          - low
          default: low
    # TextAgentic: agentic entry whose `scope` is fixed to "text"; `scope` is
    # the only required property.
    TextAgentic:
      title: TextAgentic
      type: object
      required:
      - scope
      properties:
        scope:
          title: Scope
          type: string
          const: text
        prompt:
          title: Prompt
          description: 'Custom instructions for agentic text. Note: This only applies to form regions (key-value).'
          anyOf:
          - type: string
          - type: 'null'
    # DirectWebhookConfig: webhook variant whose `mode` is fixed to "direct";
    # `url` is required.
    DirectWebhookConfig:
      title: DirectWebhookConfig
      type: object
      required:
      - url
      properties:
        mode:
          title: Mode
          type: string
          const: direct
          default: direct
        url:
          title: Url
          type: string
    # TableAgentic: agentic entry whose `scope` is fixed to "table"; `scope` is
    # the only required property.
    TableAgentic:
      title: TableAgentic
      type: object
      required:
      - scope
      properties:
        scope:
          title: Scope
          type: string
          const: table
        prompt:
          title: Prompt
          description: Custom prompt for table agentic.
          anyOf:
          - type: string
          - type: 'null'
        mode:
          title: Mode
          description: >-
            Routing mode for table agentic: 'default' runs enrichment on all tables, 'auto'
            uses the router to skip tables where enrichment is unlikely to help.
          type: string
          enum:
          - default
          - auto
          default: default
    # SplitLargeTables: on/off flag plus a `size` that is either an integer or
    # a SplitLargeTableSizes object.
    SplitLargeTables:
      title: SplitLargeTables
      type: object
      properties:
        enabled:
          title: Enabled
          description: If True, split large tables into smaller tables. Defaults to True.
          type: boolean
          default: true
        size:
          title: Size
          description: >-
            The size of the tables to split into. Defaults to 50. Use 'row' and 'column' to
            independently specify the number of rows and columns to include when splitting.
            If you only want to split by rows or columns, set the other value to None.
          anyOf:
          - type: integer
          - $ref: '#/components/schemas/SplitLargeTableSizes'
          default: 50
    # PageRange: optional `start` and `end` page numbers, each an integer or
    # null.
    PageRange:
      title: PageRange
      type: object
      properties:
        start:
          title: Start
          description: The page number to start processing from (1-indexed).
          anyOf:
          - type: integer
          - type: 'null'
        end:
          title: End
          description: The page number to stop processing at (1-indexed).
          anyOf:
          - type: integer
          - type: 'null'
    # DeepSplitPartition: a named partition whose `pages` are
    # DeepSplitPageEvidence entries. Both properties are required.
    DeepSplitPartition:
      title: DeepSplitPartition
      type: object
      required:
      - name
      - pages
      properties:
        name:
          title: Name
          type: string
        pages:
          title: Pages
          type: array
          items:
            $ref: '#/components/schemas/DeepSplitPageEvidence'
    # SplitResponse: 200 response body of POST /split. `usage` and `result` are
    # required; `result` is either a SplitResult or a DeepSplitResult.
    SplitResponse:
      title: SplitResponse
      type: object
      required:
      - usage
      - result
      properties:
        response_type:
          title: Response Type
          type: string
          const: split
          default: split
        usage:
          $ref: '#/components/schemas/ParseUsage'
        result:
          title: Result
          description: The split result.
          anyOf:
          - $ref: '#/components/schemas/SplitResult'
          - $ref: '#/components/schemas/DeepSplitResult'
    # DeepSplitResult: one of the two shapes allowed for SplitResponse.result;
    # carries a required list of DeepSplit entries.
    DeepSplitResult:
      title: DeepSplitResult
      type: object
      required:
      - splits
      properties:
        splits:
          title: Splits
          type: array
          items:
            $ref: '#/components/schemas/DeepSplit'
    # SplitCategory: item type of the `split_description` request property.
    # `name` and `description` are required; `partition_key` may be a string
    # or null.
    SplitCategory:
      title: SplitCategory
      type: object
      required:
      - name
      - description
      properties:
        name:
          title: Name
          type: string
        description:
          title: Description
          type: string
        partition_key:
          title: Partition Key
          anyOf:
          - type: string
          - type: 'null'
    # UploadResponse: accepted as one form of the request `input`. `file_id` is
    # required; `presigned_url` may be a string or null.
    UploadResponse:
      title: UploadResponse
      type: object
      required:
      - file_id
      properties:
        file_id:
          title: File Id
          type: string
        presigned_url:
          title: Presigned Url
          anyOf:
          - type: string
          - type: 'null'
    # SplitLargeTableSizes: object form of SplitLargeTables.size, with separate
    # nullable `row` and `column` counts.
    SplitLargeTableSizes:
      title: SplitLargeTableSizes
      type: object
      properties:
        row:
          title: Row
          description: >-
            The number of rows to include in each chunk when splitting large tables. Does
            not chunk rows if set to None.
          anyOf:
          - type: integer
          - type: 'null'
        column:
          title: Column
          description: >-
            The number of columns to include in each chunk when splitting large tables.
            Does not chunk columns if set to None.
          anyOf:
          - type: integer
          - type: 'null'
    # Retrieval: chunking options, block types to filter out, and the
    # embedding-optimized flag. Nothing is required.
    Retrieval:
      title: Retrieval
      type: object
      properties:
        chunking:
          default:
            chunk_mode: disabled
            chunk_overlap: 0
          $ref: '#/components/schemas/Chunking'
        filter_blocks:
          title: Filter Blocks
          description: A list of block types to filter out from 'content' and 'embed' fields. By default, no blocks are filtered.
          type: array
          items:
            type: string
            enum:
            - Header
            - Footer
            - Title
            - Section Header
            - Page Number
            - List Item
            - Figure
            - Table
            - Key Value
            - Text
            - Comment
            - Signature
          default: []
        embedding_optimized:
          title: Embedding Optimized
          description: If True, use embedding optimized mode. Defaults to False.
          type: boolean
          default: false
    # AsyncSplitResponse: 200 response body of POST /split_async; carries the
    # required `job_id` string.
    AsyncSplitResponse:
      title: AsyncSplitResponse
      type: object
      required:
      - job_id
      properties:
        job_id:
          title: Job Id
          type: string
    # Formatting: output formatting options (page markers, table output format,
    # table merging, extra formatting to include). Nothing is required.
    Formatting:
      title: Formatting
      type: object
      properties:
        add_page_markers:
          title: Add Page Markers
          description: >-
            If True, add page markers to the output. Defaults to False. Useful for
            extracting data with page specific information.
          type: boolean
          default: false
        table_output_format:
          title: Table Output Format
          description: >-
            The mode to use for table output. Defaults to dynamic, which returns md for
            simpler tables and html for more complex tables.
          type: string
          enum:
          - html
          - json
          - md
          - jsonbbox
          - dynamic
          - csv
          default: dynamic
        merge_tables:
          title: Merge Tables
          description: >-
            A flag to indicate if consecutive tables with the same number of columns should
            be merged. Defaults to False.
          type: boolean
          default: false
        include:
          title: Include
          description: A list of formatting to include in the output.
          type: array
          items:
            type: string
            enum:
            - change_tracking
            - highlight
            - comments
            - hyperlinks
            - signatures
            - ignore_watermarks
          default: []
    # config__v3__AsyncSplitConfig (titled "AsyncSplitConfig"): request body of
    # POST /split_async. It has the same properties as SyncSplitConfig plus an
    # `async` block; `input` and `split_description` are mandatory.
    config__v3__AsyncSplitConfig:
      title: AsyncSplitConfig
      type: object
      required:
      - input
      - split_description
      properties:
        async:
          description: The configuration options for asynchronous processing (default synchronous).
          $ref: '#/components/schemas/config__v3__AsyncConfig'
          default:
            priority: false
        input:
          title: Input
          description: "For parse/split/extract pipelines, the URL of the document to be processed. You can provide one of\
            \ the following:\n            1. A publicly available URL\n            2. A presigned S3 URL\n            3. A\
            \ reducto:// prefixed URL obtained from the /upload endpoint after directly uploading a document\n           \
            \ 4. A jobid:// prefixed URL obtained from a previous /parse invocation\n            5. A list of URLs (for multi-document\
            \ pipelines, V3 API only)\n\n            For edit pipelines, this should be a string containing the edit instructions "
          anyOf:
          - type: string
          - type: array
            items:
              type: string
          - $ref: '#/components/schemas/UploadResponse'
        parsing:
          description: >-
            The configuration options for parsing the document. If you are passing in a
            jobid:// URL for the file, then this configuration will be ignored.
          $ref: '#/components/schemas/ParseOptions'
          default:
            enhance:
              agentic: []
              summarize_figures: true
              intelligent_ordering: false
            retrieval:
              chunking:
                chunk_mode: disabled
                chunk_overlap: 0
              filter_blocks: []
              embedding_optimized: false
            formatting:
              add_page_markers: false
              table_output_format: dynamic
              merge_tables: false
              include: []
            spreadsheet:
              split_large_tables:
                enabled: true
                size: 50
              include: []
              clustering: accurate
              exclude: []
            settings:
              ocr_system: standard
              extraction_mode: hybrid
              force_url_result: false
              return_ocr_data: false
              return_images: []
              embed_pdf_metadata: false
              embed_pdf_metadata_dpi: 100
              persist_results: false
        # Each entry is a SplitCategory object.
        split_description:
          title: Split Description
          description: The configuration options for processing the document.
          type: array
          items:
            $ref: '#/components/schemas/SplitCategory'
        split_rules:
          title: Split Rules
          description: The prompt that describes rules for splitting the document.
          type: string
          default: >-
            Split the document into the applicable sections. Sections may only overlap at
            their first and last page if at all.
        settings:
          description: The settings for split processing.
          $ref: '#/components/schemas/SplitSettings'
          default:
            table_cutoff: truncate
            allow_page_overlap: true
            deep_split: false
    # DeepSplitPageEvidence: a page number with its evidence text and an
    # optional high/medium/low (or null) confidence. `page_number` and
    # `evidence` are required.
    DeepSplitPageEvidence:
      title: DeepSplitPageEvidence
      type: object
      required:
      - page_number
      - evidence
      properties:
        page_number:
          title: Page Number
          type: integer
        evidence:
          title: Evidence
          type: string
        confidence:
          title: Confidence
          anyOf:
          - type: string
            enum:
            - high
            - medium
            - low
          - type: 'null'
    DeepSplit:
      properties:
        name:
          type: string
          title: Name
        pages:
          items:
            $ref: '#/components/schemas/DeepSplitPageEvidence'
          type: array
          title: Pages
        partitions:
          anyOf:
          - items:
              $ref: '#/components/schemas/DeepSplitPartition'
            type: array
          - type: 'null'
          title: Partitions
      type: object
      required:
      - name
      - pages
    

# --- truncated at 32 KB (32 KB total) ---
# Full source: https://raw.githubusercontent.com/api-evangelist/reducto-ai/refs/heads/main/openapi/reducto-split-api-openapi.yml