# Hugging Face Text Generation Inference API
#
# High-performance toolkit for deploying and serving large language models with optimized inference.
#
# OpenAPI Specification
#
# Source file: hugging-face-text-generation-inference-api.yml
# Root of the OpenAPI 3.1 document: API metadata, base URLs, the default
# security requirement, and the tags used to group operations.
openapi: 3.1.0
info:
  title: Hugging Face Text Generation Inference API
  description: >-
    High-performance toolkit for deploying and serving large language models with
    optimized inference. Provides both a custom TGI API and an OpenAI-compatible
    Messages API for chat completions. Supports streaming, tool calling,
    structured output, grammar constraints, and multi-modal inputs.
  version: 1.0.0
  termsOfService: https://huggingface.co/terms-of-service
  contact:
    name: Hugging Face Support
    url: https://huggingface.co/support
  license:
    name: Apache 2.0
    url: https://www.apache.org/licenses/LICENSE-2.0
# Base URLs: the hosted endpoint first, then a self-hosted local instance.
servers:
- url: https://api-inference.huggingface.co
  description: Hugging Face hosted TGI server
- url: http://localhost:8080
  description: Local TGI server (self-hosted)
# Default requirement for every operation; operations that declare
# `security: []` override it and need no token.
security:
- bearerAuth: []
# Each operation below references exactly one of these tags.
tags:
- name: Text Generation
  description: TGI native text generation endpoints
- name: Chat
  description: OpenAI-compatible chat completion endpoints
- name: Info
  description: Server and model information
paths:
  # Non-streaming generation: one JSON request in, one JSON response out.
  # Uses the document-level bearerAuth requirement (no local override).
  /generate:
    post:
      summary: Generate Text
      description: >-
        Generate text from a prompt using the loaded model. Returns the full
        generated text in a single response.
      operationId: generate
      tags:
      - Text Generation
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/GenerateRequest'
            examples:
              GenerateRequestExample:
                summary: Default generate request
                x-microcks-default: true
                # Values are kept inside the GenerateRequest constraints
                # (e.g. top_p <= 1) and reuse that schema's own examples.
                value:
                  inputs: What is deep learning?
                  parameters:
                    max_new_tokens: 100
                    temperature: 0.7
                    top_p: 0.95
                    top_k: 50
                    repetition_penalty: 1.1
                    do_sample: true
                    seed: 10
                    stop:
                    - example_value
                    watermark: true
                    return_full_text: true
                    decoder_input_details: true
                    details: true
                    truncate: 10
                    typical_p: 0.95
                    best_of: 10
                    # `regex` so the value is a well-formed grammar for its type.
                    grammar:
                      type: regex
                      value: '[a-z ]+'
      responses:
        '200':
          description: Generated text
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/GenerateResponse'
              examples:
                Generate200Example:
                  summary: Default generate 200 response
                  x-microcks-default: true
                  value:
                    generated_text: example_value
                    details:
                      finish_reason: length
                      generated_tokens: 10
                      seed: 10
                      # Token ids are integers in GenerateResponse; logprobs are
                      # log-probabilities, hence non-positive.
                      prefill:
                      - id: 1
                        text: example_value
                        logprob: -0.5
                      tokens:
                      - id: 2
                        text: example_value
                        logprob: -0.5
                        special: true
                      best_of_sequences:
                      - {}
          # Timing and token-accounting headers returned with the result.
          headers:
            x-compute-type:
              schema:
                type: string
              description: Compute backend used
            x-compute-characters:
              schema:
                type: integer
              description: Number of input characters
            x-total-time:
              schema:
                type: number
              description: Total inference time in milliseconds
            x-validation-time:
              schema:
                type: number
              description: Input validation time in milliseconds
            x-queue-time:
              schema:
                type: number
              description: Queue wait time in milliseconds
            x-inference-time:
              schema:
                type: number
              description: Model inference time in milliseconds
            x-time-per-token:
              schema:
                type: number
              description: Average time per generated token in milliseconds
            x-prompt-tokens:
              schema:
                type: integer
              description: Number of prompt tokens
            x-generated-tokens:
              schema:
                type: integer
              description: Number of generated tokens
        # All error statuses share the ErrorResponse body.
        '400':
          description: Invalid input - validation error
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorResponse'
              examples:
                Generate400Example:
                  summary: Default generate 400 response
                  x-microcks-default: true
                  value:
                    error: example_value
                    error_type: example_value
        '422':
          description: Generation error
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorResponse'
              examples:
                Generate422Example:
                  summary: Default generate 422 response
                  x-microcks-default: true
                  value:
                    error: example_value
                    error_type: example_value
        '429':
          description: Model is overloaded
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorResponse'
              examples:
                Generate429Example:
                  summary: Default generate 429 response
                  x-microcks-default: true
                  value:
                    error: example_value
                    error_type: example_value
      x-microcks-operation:
        delay: 0
        dispatcher: FALLBACK
  # Streaming generation: same request body as /generate, but the 200 response
  # is a text/event-stream whose events follow StreamResponse.
  /generate_stream:
    post:
      summary: Generate Text With Streaming
      description: >-
        Generate text from a prompt using Server-Sent Events (SSE) streaming.
        Tokens are returned incrementally as they are generated.
      operationId: generateStream
      tags:
      - Text Generation
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/GenerateRequest'
            examples:
              GeneratestreamRequestExample:
                summary: Default generateStream request
                x-microcks-default: true
                # Values are kept inside the GenerateRequest constraints
                # (e.g. top_p <= 1) and reuse that schema's own examples.
                value:
                  inputs: What is deep learning?
                  parameters:
                    max_new_tokens: 100
                    temperature: 0.7
                    top_p: 0.95
                    top_k: 50
                    repetition_penalty: 1.1
                    do_sample: true
                    seed: 10
                    stop:
                    - example_value
                    watermark: true
                    return_full_text: true
                    decoder_input_details: true
                    details: true
                    truncate: 10
                    typical_p: 0.95
                    best_of: 10
                    # `regex` so the value is a well-formed grammar for its type.
                    grammar:
                      type: regex
                      value: '[a-z ]+'
      responses:
        '200':
          description: Stream of generated tokens
          content:
            text/event-stream:
              schema:
                $ref: '#/components/schemas/StreamResponse'
              examples:
                Generatestream200Example:
                  summary: Default generateStream 200 response
                  x-microcks-default: true
                  # Shown as a final event: per the StreamResponse schema,
                  # generated_text and details appear only in the last event.
                  value:
                    token:
                      # Integer id, as StreamResponse.token.id requires.
                      id: 1
                      text: example_value
                      logprob: -0.5
                      special: true
                    generated_text: example_value
                    details:
                      finish_reason: length
                      generated_tokens: 10
                      seed: 10
        # Error statuses are returned as JSON, not as an event stream.
        '400':
          description: Invalid input
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorResponse'
              examples:
                Generatestream400Example:
                  summary: Default generateStream 400 response
                  x-microcks-default: true
                  value:
                    error: example_value
                    error_type: example_value
        '429':
          description: Model is overloaded
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorResponse'
              examples:
                Generatestream429Example:
                  summary: Default generateStream 429 response
                  x-microcks-default: true
                  value:
                    error: example_value
                    error_type: example_value
      x-microcks-operation:
        delay: 0
        dispatcher: FALLBACK
  # OpenAI-style chat endpoint. The 200 response is declared for two media
  # types: a single JSON body and an SSE stream of chunks.
  /v1/chat/completions:
    post:
      summary: Create Chat Completion (OpenAI Compatible)
      description: >-
        Create a chat completion using the OpenAI-compatible Messages API.
        Supports conversations, tool calling, structured output via JSON schema,
        and multi-modal inputs (text + images). Fully compatible with OpenAI
        client libraries.
      operationId: chatCompletions
      tags:
      - Chat
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ChatCompletionRequest'
            examples:
              ChatcompletionsRequestExample:
                summary: Default chatCompletions request
                x-microcks-default: true
                value:
                  model: tgi
                  # One message per role. tool_calls entries carry the
                  # required id/type/function; the tool message answers the
                  # call via tool_call_id.
                  messages:
                  - role: system
                    content: You are a helpful assistant.
                  - role: user
                    content: What is the weather in Paris?
                  - role: assistant
                    tool_calls:
                    - id: call_0
                      type: function
                      function:
                        name: get_weather
                        arguments: '{"city": "Paris"}'
                  - role: tool
                    tool_call_id: call_0
                    name: get_weather
                    content: '{"temperature_c": 18}'
                  # Penalties must stay within the schema range [-2.0, 2.0].
                  frequency_penalty: 0.5
                  logprobs: true
                  max_tokens: 10
                  presence_penalty: 0.5
                  # response_format is an object in the schema, not a string.
                  response_format:
                    type: text
                  seed: 10
                  stop:
                  - example_value
                  stream: true
                  stream_options:
                    include_usage: true
                  temperature: 0.7
                  # NOTE(review): tool_choice, tool_prompt and the tool
                  # type/parameters placeholders are left as generated; their
                  # schemas are outside the reviewed portion - confirm.
                  tool_choice: example_value
                  tool_prompt: example_value
                  tools:
                  - type: example_value
                    function:
                      name: get_weather
                      description: A sample description.
                      parameters: example_value
                  top_logprobs: 10
                  top_p: 0.95
      responses:
        '200':
          description: Chat completion response
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ChatCompletionResponse'
              examples:
                Chatcompletions200Example:
                  summary: Default chatCompletions 200 response
                  x-microcks-default: true
                  # NOTE(review): the response schema is outside the reviewed
                  # portion; list-valued fields follow the request schema and
                  # the OpenAI format - confirm.
                  value:
                    id: abc123
                    object: chat.completion
                    created: 10
                    model: example_value
                    system_fingerprint: example_value
                    choices:
                    - index: 0
                      message:
                        role: assistant
                        content: example_value
                        tool_calls: []
                        tool_call_id: '500123'
                      finish_reason: stop
                      logprobs:
                        content: []
                    usage:
                      prompt_tokens: 10
                      completion_tokens: 10
                      total_tokens: 20
            text/event-stream:
              schema:
                $ref: '#/components/schemas/ChatCompletionStreamResponse'
              examples:
                Chatcompletions200Example:
                  summary: Default chatCompletions 200 response
                  x-microcks-default: true
                  # NOTE(review): chunk schema is outside the reviewed portion;
                  # values follow the OpenAI streaming format - confirm.
                  value:
                    id: abc123
                    object: chat.completion.chunk
                    created: 10
                    model: example_value
                    system_fingerprint: example_value
                    choices:
                    - index: 0
                      delta:
                        role: assistant
                        content: example_value
                        tool_calls: []
                        tool_call_id: '500123'
                      finish_reason: example_value
                      logprobs:
                        content: []
                    usage:
                      prompt_tokens: 10
                      completion_tokens: 10
                      total_tokens: 20
        # All error statuses share the ErrorResponse body.
        '400':
          description: Bad request
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorResponse'
              examples:
                Chatcompletions400Example:
                  summary: Default chatCompletions 400 response
                  x-microcks-default: true
                  value:
                    error: example_value
                    error_type: example_value
        '422':
          description: Unprocessable entity
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorResponse'
              examples:
                Chatcompletions422Example:
                  summary: Default chatCompletions 422 response
                  x-microcks-default: true
                  value:
                    error: example_value
                    error_type: example_value
        '429':
          description: Model overloaded
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorResponse'
              examples:
                Chatcompletions429Example:
                  summary: Default chatCompletions 429 response
                  x-microcks-default: true
                  value:
                    error: example_value
                    error_type: example_value
      x-microcks-operation:
        delay: 0
        dispatcher: FALLBACK
  # OpenAI-style text completion endpoint. Like the chat endpoint, the 200
  # response is declared both as JSON and as an SSE stream.
  /v1/completions:
    post:
      summary: Create Text Completion (OpenAI Compatible)
      description: >-
        Create a text completion using the OpenAI-compatible completions API
        format.
      operationId: completions
      tags:
      - Text Generation
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CompletionRequest'
            examples:
              CompletionsRequestExample:
                summary: Default completions request
                x-microcks-default: true
                # NOTE(review): CompletionRequest is outside the reviewed
                # portion; sampling values use typical ranges (top_p as a
                # probability) - confirm against the schema.
                value:
                  model: tgi
                  prompt: example_value
                  max_tokens: 10
                  temperature: 0.7
                  top_p: 0.95
                  stop:
                  - example_value
                  stream: true
                  seed: 10
                  suffix: example_value
      responses:
        '200':
          description: Completion response
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/CompletionResponse'
              examples:
                Completions200Example:
                  summary: Default completions 200 response
                  x-microcks-default: true
                  # NOTE(review): `object` follows the OpenAI completions
                  # format - confirm against CompletionResponse.
                  value:
                    id: abc123
                    object: text_completion
                    created: 10
                    model: example_value
                    choices:
                    - text: example_value
                      index: 0
                      finish_reason: length
                    usage:
                      prompt_tokens: 10
                      completion_tokens: 10
                      total_tokens: 20
            text/event-stream:
              schema:
                $ref: '#/components/schemas/CompletionStreamResponse'
              examples:
                Completions200Example:
                  summary: Default completions 200 response
                  x-microcks-default: true
                  value:
                    id: abc123
                    object: text_completion
                    created: 10
                    model: example_value
                    choices:
                    - text: example_value
                      index: 0
                      finish_reason: length
      x-microcks-operation:
        delay: 0
        dispatcher: FALLBACK
  # Model listing in the OpenAI list envelope. `security: []` overrides the
  # document-level bearerAuth requirement, so no token is needed.
  /v1/models:
    get:
      summary: List Available Models
      description: Get information about the model currently loaded on this TGI server.
      operationId: listModels
      tags:
      - Info
      security: []
      responses:
        '200':
          description: Available models
          content:
            application/json:
              schema:
                type: object
                properties:
                  object:
                    type: string
                    const: list
                  data:
                    type: array
                    items:
                      type: object
                      properties:
                        id:
                          type: string
                          description: Model ID
                        object:
                          type: string
                          const: model
                        created:
                          type: integer
                        owned_by:
                          type: string
              examples:
                Listmodels200Example:
                  summary: Default listModels 200 response
                  x-microcks-default: true
                  # `object` fields must equal the schema constants
                  # (`list` for the envelope, `model` for each entry).
                  value:
                    object: list
                    data:
                    - id: abc123
                      object: model
                      created: 10
                      owned_by: example_value
      x-microcks-operation:
        delay: 0
        dispatcher: FALLBACK
  # Server/model metadata endpoint. `security: []` overrides the
  # document-level bearerAuth requirement, so no token is needed.
  /info:
    get:
      summary: Get Server Information
      description: >-
        Get metadata about the TGI server including the loaded model, version,
        SHA, Docker label, and model-specific parameters.
      operationId: getInfo
      tags:
      - Info
      security: []
      responses:
        '200':
          description: Server and model information
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Info'
              examples:
                Getinfo200Example:
                  summary: Default getInfo 200 response
                  x-microcks-default: true
                  # Placeholder values; the Info schema they should satisfy
                  # is defined under components.schemas.
                  value:
                    model_id: '500123'
                    model_sha: example_value
                    model_dtype: float16
                    model_device_type: example_value
                    model_pipeline_tag: example_value
                    max_concurrent_requests: 10
                    max_best_of: 10
                    max_stop_sequences: 10
                    max_input_length: 10
                    max_total_tokens: 10
                    waiting_served_ratio: 42.5
                    max_batch_total_tokens: 10
                    max_waiting_tokens: 10
                    validation_workers: 10
                    max_client_batch_size: 10
                    version: example_value
                    sha: example_value
                    docker_label: example_value
      x-microcks-operation:
        delay: 0
        dispatcher: FALLBACK
  # Readiness probe. No token required (`security: []`), and neither status
  # declares a response body: the status code alone carries the result.
  /health:
    get:
      summary: Health Check
      description: >-
        Check the health of the TGI server. Returns 200 if the model is loaded
        and ready to serve requests.
      operationId: healthCheck
      tags:
      - Info
      security: []
      responses:
        '200':
          description: Server is healthy and ready
        '503':
          description: Server is not ready (model still loading)
      x-microcks-operation:
        delay: 0
        dispatcher: FALLBACK
  # Metrics scrape endpoint. No token required (`security: []`); the body is
  # plain text rather than JSON.
  /metrics:
    get:
      summary: Get Prometheus Metrics
      description: >-
        Get server metrics in Prometheus format including request counts,
        latencies, queue sizes, and token throughput.
      operationId: getMetrics
      tags:
      - Info
      security: []
      responses:
        '200':
          description: Prometheus-formatted metrics
          content:
            text/plain:
              schema:
                type: string
              examples:
                Getmetrics200Example:
                  summary: Default getMetrics 200 response
                  x-microcks-default: true
                  value: example_value
      x-microcks-operation:
        delay: 0
        dispatcher: FALLBACK
  # Tokenizer endpoint: takes text, returns an array of token objects.
  # Uses the document-level bearerAuth requirement (no local override).
  /tokenize:
    post:
      summary: Tokenize Text
      description: >-
        Tokenize a string of text using the model's tokenizer. Returns the
        token IDs and their string representations.
      operationId: tokenize
      tags:
      - Info
      requestBody:
        required: true
        content:
          application/json:
            schema:
              type: object
              required:
              - inputs
              properties:
                inputs:
                  type: string
                  description: Text to tokenize
                add_special_tokens:
                  type: boolean
                  default: true
                  description: Whether to add special tokens
            examples:
              TokenizeRequestExample:
                summary: Default tokenize request
                x-microcks-default: true
                value:
                  inputs: example_value
                  add_special_tokens: true
      responses:
        '200':
          description: Tokenized result
          content:
            application/json:
              schema:
                type: array
                items:
                  type: object
                  properties:
                    id:
                      type: integer
                      description: Token ID
                    text:
                      type: string
                      description: Token text
                    start:
                      type: integer
                    stop:
                      type: integer
                    special:
                      type: boolean
              examples:
                Tokenize200Example:
                  summary: Default tokenize 200 response
                  x-microcks-default: true
                  value:
                  # Token id is an integer, as the item schema requires.
                  - id: 1
                    text: example_value
                    start: 10
                    stop: 10
                    special: true
      x-microcks-operation:
        delay: 0
        dispatcher: FALLBACK
components:
  # The single scheme referenced by the document-level `security` entry:
  # an HTTP `Authorization: Bearer <token>` header.
  securitySchemes:
    bearerAuth:
      type: http
      scheme: bearer
      # bearerFormat is a documentation hint only.
      bearerFormat: HF Token
      description: >-
        Hugging Face API token or custom auth token for self-hosted instances.
  schemas:
    # Request body shared by /generate and /generate_stream: a required
    # prompt plus an optional bag of sampling/decoding parameters.
    GenerateRequest:
      type: object
      required:
      - inputs
      properties:
        inputs:
          type: string
          description: Input prompt text
          example: What is deep learning?
        parameters:
          type: object
          properties:
            max_new_tokens:
              type: integer
              description: Maximum number of tokens to generate
              default: 20
              example: 100
            temperature:
              type: number
              format: float
              description: Sampling temperature (higher = more random)
              minimum: 0
              example: 0.7
            top_p:
              type: number
              format: float
              description: Nucleus sampling probability threshold
              minimum: 0
              maximum: 1
              example: 0.95
            top_k:
              type: integer
              description: Top-k sampling parameter
              minimum: 0
              example: 50
            repetition_penalty:
              type: number
              format: float
              description: Repetition penalty (1.0 = no penalty)
              example: 1.1
            do_sample:
              type: boolean
              description: Whether to use sampling vs greedy decoding
              default: false
            seed:
              type: integer
              description: Random seed for reproducibility
            stop:
              type: array
              items:
                type: string
              description: Stop sequences
            watermark:
              type: boolean
              description: Whether to add a watermark to generated text
              default: false
            return_full_text:
              type: boolean
              description: Include the input prompt in the response
              default: false
            decoder_input_details:
              type: boolean
              description: Return decoder input token details
              default: false
            details:
              type: boolean
              description: Return generation details (tokens, logprobs, etc.)
              default: false
            truncate:
              type: integer
              description: Truncate input to this many tokens
            typical_p:
              type: number
              format: float
              description: Typical decoding probability threshold
            best_of:
              type: integer
              description: Generate this many sequences and return the best
              minimum: 1
            grammar:
              type: object
              description: Grammar constraints for generation
              properties:
                type:
                  type: string
                  enum:
                  - json
                  - regex
                value:
                  type: string
                  description: Grammar specification (JSON schema or regex)
          # The example must itself be an object; it reuses the per-property
          # examples declared above.
          example:
            max_new_tokens: 100
            temperature: 0.7
            top_p: 0.95
            top_k: 50
            repetition_penalty: 1.1
    # Response body of /generate: the generated text plus an optional
    # `details` object describing how generation went.
    GenerateResponse:
      type: object
      properties:
        generated_text:
          type: string
          description: The generated text
          example: example_value
        details:
          type: object
          properties:
            finish_reason:
              type: string
              enum:
              - length
              - eos_token
              - stop_sequence
              description: Reason generation stopped
            generated_tokens:
              type: integer
              description: Number of generated tokens
            seed:
              type: integer
              description: Sampling seed used
            prefill:
              type: array
              items:
                type: object
                properties:
                  id:
                    type: integer
                  text:
                    type: string
                  logprob:
                    type: number
              description: Prefill token details
            tokens:
              type: array
              items:
                type: object
                properties:
                  id:
                    type: integer
                  text:
                    type: string
                  logprob:
                    type: number
                  special:
                    type: boolean
              description: Generated token details
            best_of_sequences:
              type: array
              items:
                type: object
              description: Best-of sequences details
          # The example must itself be an object matching the properties above.
          example:
            finish_reason: length
            generated_tokens: 10
            seed: 10
    # One server-sent event of /generate_stream: the token just produced,
    # plus the full text and details on the last event only.
    StreamResponse:
      type: object
      properties:
        token:
          type: object
          properties:
            id:
              type: integer
            text:
              type: string
            logprob:
              type: number
            special:
              type: boolean
          # Object-valued example matching the token properties above.
          example:
            id: 1
            text: example_value
            logprob: -0.5
            special: false
        generated_text:
          type: string
          description: Full generated text (only in the last event)
          example: example_value
        details:
          type: object
          description: Generation details (only in the last event)
          properties:
            finish_reason:
              type: string
            generated_tokens:
              type: integer
            seed:
              type: integer
          # Object-valued example matching the details properties above.
          example:
            finish_reason: length
            generated_tokens: 10
            seed: 10
    ChatCompletionRequest:
      type: object
      required:
      - model
      - messages
      properties:
        model:
          type: string
          description: Model identifier (can be tgi or a model ID)
          example: tgi
        messages:
          type: array
          description: Conversation messages
          items:
            type: object
            required:
            - role
            properties:
              role:
                type: string
                enum:
                - system
                - user
                - assistant
                - tool
              content:
                oneOf:
                - type: string
                - type: array
                  items:
                    oneOf:
                    - type: object
                      required:
                      - type
                      - text
                      properties:
                        type:
                          type: string
                          const: text
                        text:
                          type: string
                    - type: object
                      required:
                      - type
                      - image_url
                      properties:
                        type:
                          type: string
                          const: image_url
                        image_url:
                          type: object
                          required:
                          - url
                          properties:
                            url:
                              type: string
              tool_calls:
                type: array
                items:
                  type: object
                  required:
                  - id
                  - type
                  - function
                  properties:
                    id:
                      type: string
                    type:
                      type: string
                    function:
                      type: object
                      required:
                      - name
                      properties:
                        name:
                          type: string
                        arguments:
                          type: string
                        description:
                          type: string
              tool_call_id:
                type: string
              name:
                type: string
          example: []
        frequency_penalty:
          type: number
          minimum: -2.0
          maximum: 2.0
          # Example kept inside the declared [-2.0, 2.0] range.
          example: 0.5
        logprobs:
          type: boolean
          example: true
        max_tokens:
          type: integer
          description: Maximum tokens to generate
          example: 10
        presence_penalty:
          type: number
          minimum: -2.0
          maximum: 2.0
          # Example kept inside the declared [-2.0, 2.0] range.
          example: 0.5
        response_format:
          oneOf:
          - type: object
            properties:
              type:
                type: string
                const: text
          - type: object
            required:
            - type
            - json_schema
            properties:
              type:
                type: string
                const: json_schema
              

# --- truncated at 32 KB (43 KB total) ---
# Full source: https://raw.githubusercontent.com/api-evangelist/hugging-face/refs/heads/main/openapi/hugging-face-text-generation-inference-api.yml