# IBM Speech to Text API
#
# The IBM Speech to Text API provides speech-recognition capabilities to produce transcripts of spoken audio. It supports multiple languages and audio formats with features including speaker labels, keyword spotting, smart formatting, and custom language and acoustic models.
#
# OpenAPI Specification
#
# Source file: ibm-speech-to-text-openapi.yml
# Version of the OpenAPI Specification this document is written against.
openapi: '3.0.3'
# API-level metadata: title, version, summary text, contact and license links.
info:
  title: 'International Business Machines IBM Speech to Text API'
  version: '1.0.0'
  description: >-
    The IBM Speech to Text API provides speech-recognition capabilities
    to produce transcripts of spoken audio. It supports multiple languages
    and audio formats, and offers features such as speaker labels,
    keyword spotting, smart formatting, and custom language and acoustic
    models.
  contact:
    url: 'https://cloud.ibm.com/apidocs/speech-to-text'
    name: 'IBM Cloud'
  license:
    url: 'https://www.ibm.com/terms'
    name: 'IBM Cloud Terms'
# Base URLs the API is served from; each entry is labelled with its location.
servers:
  - description: 'US South (Dallas)'
    url: 'https://api.us-south.speech-to-text.watson.cloud.ibm.com'
  - description: 'EU Central (Frankfurt)'
    url: 'https://api.eu-de.speech-to-text.watson.cloud.ibm.com'
paths:
  # Speech recognition: POST binary audio, receive the transcription as JSON.
  /v1/recognize:
    post:
      operationId: recognize
      summary: International Business Machines Recognize audio
      description: >-
        Send audio and receive a transcription of the audio content.
      tags:
        - Recognition
      # No Content-Type header parameter is declared here on purpose:
      # OpenAPI 3.0 ignores header parameters named Content-Type, and the
      # accepted audio media types are the keys of requestBody.content below.
      parameters:
        - name: model
          in: query
          schema:
            type: string
          description: The model to use for recognition.
      requestBody:
        required: true
        # Each media type carries the raw audio bytes as the request body.
        content:
          audio/flac:
            schema:
              type: string
              format: binary
          audio/wav:
            schema:
              type: string
              format: binary
          audio/mp3:
            schema:
              type: string
              format: binary
      responses:
        '200':
          description: Successful transcription
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/SpeechRecognitionResults'
  # Collection endpoint listing the available language models.
  /v1/models:
    get:
      tags:
        - Models
      operationId: listModels
      summary: 'International Business Machines List models'
      description: 'List all available language models.'
      responses:
        '200':
          description: 'Successful response'
          content:
            application/json:
              schema:
                # Envelope object: the list is delivered under "models".
                type: object
                properties:
                  models:
                    items:
                      $ref: '#/components/schemas/SpeechModel'
                    type: array
  # Single-model lookup keyed by the model_id path segment.
  /v1/models/{model_id}:
    get:
      tags:
        - Models
      operationId: getModel
      summary: 'International Business Machines Get a model'
      description: 'Get information about a specific language model.'
      parameters:
        - in: path
          name: model_id
          required: true
          schema:
            type: string
      responses:
        '200':
          description: 'Successful response'
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/SpeechModel'
  # Custom language models: list existing ones (GET) or create one (POST).
  /v1/customizations:
    get:
      operationId: listLanguageModels
      summary: International Business Machines List custom language models
      description: List all custom language models.
      tags:
        - Custom Language Models
      responses:
        '200':
          description: Successful response
          # Body schema declared so generated clients get a typed result.
          # Nothing is marked required, so the schema stays permissive.
          # NOTE(review): field list is a subset taken from the IBM API
          # reference - confirm before relying on it.
          content:
            application/json:
              schema:
                type: object
                properties:
                  customizations:
                    type: array
                    items:
                      type: object
                      properties:
                        customization_id:
                          type: string
                        name:
                          type: string
                        base_model_name:
                          type: string
                        description:
                          type: string
                        language:
                          type: string
                        status:
                          type: string
    post:
      operationId: createLanguageModel
      summary: International Business Machines Create a custom language model
      description: Create a new custom language model for a specified base model.
      tags:
        - Custom Language Models
      requestBody:
        required: true
        content:
          application/json:
            schema:
              type: object
              # A name and the base model to customize are mandatory;
              # the description is optional.
              required:
                - name
                - base_model_name
              properties:
                name:
                  type: string
                base_model_name:
                  type: string
                description:
                  type: string
      responses:
        '201':
          description: Custom model created
          # The response carries the identifier of the new custom model.
          # NOTE(review): confirm the field name against the IBM API reference.
          content:
            application/json:
              schema:
                type: object
                properties:
                  customization_id:
                    type: string
components:
  # Named authentication schemes that security requirements refer to.
  securitySchemes:
    bearerAuth:
      # HTTP bearer-token authentication; bearerFormat is a documentation hint.
      scheme: bearer
      type: http
      bearerFormat: 'IAM Token'
  schemas:
    # JSON body returned by the recognize operation on success.
    SpeechRecognitionResults:
      type: object
      properties:
        result_index:
          type: integer
        results:
          type: array
          items:
            # One result: a "final" flag plus a list of alternatives.
            type: object
            properties:
              alternatives:
                type: array
                items:
                  # One alternative: transcript text with a confidence number.
                  type: object
                  properties:
                    confidence:
                      type: number
                    transcript:
                      type: string
              final:
                type: boolean
        speaker_labels:
          type: array
          items:
            # One label: numeric from/to values, a speaker integer and a
            # confidence number.
            type: object
            properties:
              speaker:
                type: integer
              from:
                type: number
              to:
                type: number
              confidence:
                type: number
    # Descriptor of one speech model, as returned by the /v1/models endpoints.
    SpeechModel:
      type: object
      # Every property is a plain string except "rate", which is an integer.
      properties:
        name: {type: string}
        description: {type: string}
        language: {type: string}
        rate: {type: integer}
        url: {type: string}
# Document-level security requirement: applies to every operation that does
# not declare its own. Requires the bearerAuth scheme; the empty list means
# no scopes are requested.
security:
  - bearerAuth: []
# Tag catalogue: the names operations use under "tags", with a short
# description of each group.
tags:
  - description: 'Create and manage custom language models.'
    name: 'Custom Language Models'
  - description: 'Manage speech recognition models.'
    name: 'Models'
  - description: 'Recognize and transcribe audio.'
    name: 'Recognition'