International Business Machines

IBM Text to Speech API

The IBM Text to Speech API converts written text into natural-sounding speech in a variety of languages, dialects, and voices. It supports SSML input and multiple audio output formats for building voice-enabled applications.

Documentation · GitHub · OpenAPI

Documentation

📖

Documentation

https://cloud.ibm.com/apidocs/text-to-speech

Specifications

⚙

OpenAPI

https://raw.githubusercontent.com/api-evangelist/international-business-machines/refs/heads/main/openapi/ibm-text-to-speech-openapi.yml

OpenAPI Specification

# OpenAPI specification version this document is written against.
openapi: '3.0.3'
# Top-level metadata describing the API.
info:
  title: International Business Machines IBM Text to Speech API
  # Version of this API description (distinct from the `openapi` field above).
  version: '1.0.0'
  description: >-
    The IBM Text to Speech API provides speech-synthesis capabilities
    to convert written text into natural-sounding speech in a variety
    of languages, dialects, and voices. It supports SSML input and
    multiple audio formats.
  license:
    name: IBM Cloud Terms
    url: https://www.ibm.com/terms
  contact:
    name: IBM Cloud
    url: https://cloud.ibm.com/apidocs/text-to-speech
# Base URLs the paths below are resolved against, one entry per region.
servers:
  - description: US South (Dallas)
    url: 'https://api.us-south.text-to-speech.watson.cloud.ibm.com'
  - description: EU Central (Frankfurt)
    url: 'https://api.eu-de.text-to-speech.watson.cloud.ibm.com'
paths:
  /v1/synthesize:
    # Converts the text supplied in the JSON request body into audio.
    post:
      operationId: synthesize
      summary: International Business Machines Synthesize audio
      description: >-
        Synthesize text to audio. Returns the audio in the specified format.
      tags:
        - Synthesis
      parameters:
        # OpenAPI 3.0 tooling ignores header parameters named `Accept`; the
        # media types listed under the 200 response are what clients and
        # generators actually use. The entry is retained so existing consumers
        # of this document still see the supported values.
        - name: Accept
          in: header
          description: Requested audio format of the synthesized speech.
          schema:
            type: string
            enum:
              - audio/wav
              - audio/mp3
              - audio/ogg
              - audio/flac
        - name: voice
          in: query
          description: The voice to use for synthesis.
          schema:
            type: string
      requestBody:
        required: true
        content:
          application/json:
            schema:
              type: object
              required:
                - text
              properties:
                text:
                  type: string
                  description: The text to synthesize.
      responses:
        '200':
          description: Successful synthesis
          content:
            # One entry per value of the Accept enum above, so that every
            # offered format is a declared response type. Each body is the
            # raw audio stream.
            audio/wav:
              schema:
                type: string
                format: binary
            audio/mp3:
              schema:
                type: string
                format: binary
            audio/ogg:
              schema:
                type: string
                format: binary
            audio/flac:
              schema:
                type: string
                format: binary
  # Collection endpoint: the response wraps the voice list in a `voices` array.
  /v1/voices:
    get:
      tags:
        - Voices
      operationId: listVoices
      summary: International Business Machines List voices
      description: List all available voices.
      responses:
        '200':
          description: Successful response
          content:
            application/json:
              schema:
                # Envelope object; each array element is a Voice.
                properties:
                  voices:
                    items:
                      $ref: '#/components/schemas/Voice'
                    type: array
                type: object
  # Item endpoint: returns a single Voice selected by the {voice} path segment.
  /v1/voices/{voice}:
    get:
      tags:
        - Voices
      operationId: getVoice
      summary: International Business Machines Get a voice
      description: Get information about a specific voice.
      parameters:
        # Bound to the {voice} placeholder in the path template.
        - in: path
          name: voice
          required: true
          schema:
            type: string
      responses:
        '200':
          description: Successful response
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Voice'
  # Pronunciation lookup: `text` is mandatory, `voice` and `format` are optional.
  /v1/pronunciation:
    get:
      tags:
        - Pronunciation
      operationId: getPronunciation
      summary: International Business Machines Get pronunciation
      description: Get the phonetic pronunciation of a word.
      parameters:
        - in: query
          name: text
          required: true
          schema:
            type: string
        - in: query
          name: voice
          schema:
            type: string
        # Restricted to the two values enumerated below.
        - in: query
          name: format
          schema:
            type: string
            enum:
              - ipa
              - ibm
      responses:
        '200':
          description: Successful response
          content:
            application/json:
              schema:
                # Object carrying the result in a `pronunciation` string.
                properties:
                  pronunciation:
                    type: string
                type: object
# Reusable definitions referenced from the paths and security sections.
components:
  schemas:
    # Shape of a voice as returned by the voice endpoints; no property is
    # marked required.
    Voice:
      type: object
      properties:
        name:
          type: string
        language:
          type: string
        gender:
          type: string
        description:
          type: string
        url:
          type: string
        customizable:
          type: boolean
  securitySchemes:
    # HTTP bearer authentication; bearerFormat is a documentation hint only.
    bearerAuth:
      scheme: bearer
      type: http
      bearerFormat: IAM Token
# Default security requirement: applies to every operation that does not
# declare its own `security` list.
security:
  - bearerAuth: []
# Tag catalogue used to group the operations above.
tags:
  - description: Get word pronunciations.
    name: Pronunciation
  - description: Synthesize text to audio.
    name: Synthesis
  - description: Manage available voices.
    name: Voices