elevenlabs
ElevenLabs Text to Speech API

The ElevenLabs Text to Speech API converts text into lifelike spoken audio with nuanced intonation, pacing, and emotional awareness. It supports multiple output formats including MP3, PCM, and mu-law, and offers a range of models such as Flash v2.5 for ultra-low latency real-time applications and Multilingual v2 for support across 70+ languages. Developers can select from thousands of pre-built voices or use custom cloned voices to generate speech that sounds natural and expressive.
Documentation GitHub OpenAPI
Documentation

📖
Documentation
https://elevenlabs.io/docs/api-reference/text-to-speech/convert
Specifications

⚙
OpenAPI
https://raw.githubusercontent.com/api-evangelist/elevenlabs/refs/heads/main/openapi/elevenlabs-text-to-speech-openapi.yml
⚙
AsyncAPI
https://raw.githubusercontent.com/api-evangelist/elevenlabs/refs/heads/main/asyncapi/elevenlabs-text-to-speech-streaming-asyncapi.yml
OpenAPI Specification

# OpenAPI version of this document, followed by the API's descriptive metadata.
openapi: "3.1.0"
info:
  title: ElevenLabs Text to Speech API
  # Quoted so the version stays a string rather than the float 1.0.
  version: "1.0"
  termsOfService: https://elevenlabs.io/terms-of-service
  contact:
    name: ElevenLabs Support
    url: https://help.elevenlabs.io
  # Folded scalar: the wrapped lines below are joined into a single paragraph.
  description: >-
    The ElevenLabs Text to Speech API converts text into lifelike spoken
    audio with nuanced intonation, pacing, and emotional awareness. It
    supports multiple output formats including MP3, PCM, and mu-law, and
    offers a range of models such as Eleven v3, Flash v2.5 for ultra-low
    latency real-time applications, and Multilingual v2 for support across
    70+ languages. Developers can select from thousands of pre-built voices
    or use custom cloned voices to generate speech that sounds natural and
    expressive.
# Link to the external, human-readable API documentation.
externalDocs:
  url: https://elevenlabs.io/docs/api-reference/text-to-speech/convert
  description: ElevenLabs Text to Speech API Documentation

# Single server entry; the path templates under `paths` are relative to it.
servers:
  - description: Production Server
    url: https://api.elevenlabs.io

# Tag groups referenced by the operations under `paths`.
tags:
  - name: Text to Dialogue
    description: >-
      Endpoints for converting text scripts with multiple speakers into dialogue
      audio.
  - name: Text to Speech
    description: >-
      Endpoints for converting text into speech audio with configurable
      voice, model, and output format settings.

# Document-wide security requirement: the apiKeyAuth scheme (empty scope list).
security:
  - apiKeyAuth: []
paths:
  # Non-streaming synthesis: POST text for the voice named in the path and
  # receive the generated audio as a binary body.
  /v1/text-to-speech/{voice_id}:
    post:
      tags:
        - Text to Speech
      operationId: createSpeech
      summary: Create speech
      description: >-
        Converts text into speech using a specified voice. Returns audio in
        the requested format. Supports voice settings overrides and
        pronunciation dictionary locators for fine-tuned output.
      parameters:
        - $ref: "#/components/parameters/voiceId"
        - $ref: "#/components/parameters/outputFormat"
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/TextToSpeechRequest"
      responses:
        "200":
          description: Audio file generated successfully
          content:
            audio/mpeg:
              schema:
                type: string
                format: binary
        # The error responses below declare a description only, no body schema.
        "400":
          description: "Bad request - invalid parameters"
        "401":
          description: "Unauthorized - invalid or missing API key"
        "422":
          description: "Unprocessable entity - validation error"
  # Streaming variant of speech synthesis: same parameters and request body
  # as the non-streaming operation, with the audio described as streamed.
  /v1/text-to-speech/{voice_id}/stream:
    post:
      tags:
        - Text to Speech
      operationId: streamSpeech
      summary: Stream speech
      description: >-
        Converts text into speech and streams the audio back as chunked transfer
        encoding. Useful for real-time playback scenarios where latency is
        important.
      parameters:
        - $ref: "#/components/parameters/voiceId"
        - $ref: "#/components/parameters/outputFormat"
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/TextToSpeechRequest"
      responses:
        "200":
          description: Streaming audio response
          content:
            audio/mpeg:
              schema:
                type: string
                format: binary
        # The error responses below declare a description only, no body schema.
        "400":
          description: "Bad request - invalid parameters"
        "401":
          description: "Unauthorized - invalid or missing API key"
        "422":
          description: "Unprocessable entity - validation error"
  # Speech synthesis that returns JSON instead of raw audio: the response
  # schema (TimestampedAudioResponse) carries base64 audio plus an alignment
  # object whose entries are per character, so the description says
  # "character-level" rather than "word-level".
  /v1/text-to-speech/{voice_id}/with-timestamps:
    post:
      operationId: createSpeechWithTimestamps
      summary: Create speech with timing
      description: >-
        Converts text into speech and returns both the audio and
        character-level timing information. Useful for applications that need
        to synchronize text display with audio playback.
      tags:
        - Text to Speech
      parameters:
        - $ref: '#/components/parameters/voiceId'
        - $ref: '#/components/parameters/outputFormat'
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/TextToSpeechRequest'
      responses:
        '200':
          description: Audio with timestamp data
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/TimestampedAudioResponse'
        '400':
          description: Bad request - invalid parameters
        '401':
          description: Unauthorized - invalid or missing API key
        '422':
          description: Unprocessable entity - validation error
  # Streaming synthesis with timing data. The 200 response is declared as
  # application/json using TimestampedAudioResponse, whose alignment entries
  # are per character, so the description says "character-level" rather
  # than "word-level".
  /v1/text-to-speech/{voice_id}/stream/with-timestamps:
    post:
      operationId: streamSpeechWithTimestamps
      summary: Stream speech with timing
      description: >-
        Converts text into speech and streams the audio along with
        character-level timing information. Combines the benefits of streaming
        delivery with timestamp synchronization data.
      tags:
        - Text to Speech
      parameters:
        - $ref: '#/components/parameters/voiceId'
        - $ref: '#/components/parameters/outputFormat'
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/TextToSpeechRequest'
      responses:
        '200':
          description: Streaming audio with timestamp data
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/TimestampedAudioResponse'
        '400':
          description: Bad request - invalid parameters
        '401':
          description: Unauthorized - invalid or missing API key
        '422':
          description: Unprocessable entity - validation error
  # Multi-speaker synthesis: the voices come from the request body segments,
  # so this operation takes no voice_id path parameter.
  /v1/text-to-dialogue:
    post:
      tags:
        - Text to Dialogue
      operationId: createDialogue
      summary: Create dialogue
      description: >-
        Converts a dialogue script with multiple speakers into audio. Each segment
        of the script can be assigned a different voice, enabling multi-speaker
        audio generation from a single request.
      parameters:
        - $ref: "#/components/parameters/outputFormat"
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/TextToDialogueRequest"
      responses:
        "200":
          description: Dialogue audio generated successfully
          content:
            audio/mpeg:
              schema:
                type: string
                format: binary
        # The error responses below declare a description only, no body schema.
        "400":
          description: "Bad request - invalid parameters"
        "401":
          description: "Unauthorized - invalid or missing API key"
        "422":
          description: "Unprocessable entity - validation error"
  # Multi-speaker synthesis returning JSON: the response schema
  # (TimestampedAudioResponse) carries base64 audio plus per-character
  # alignment entries, so the description says "character-level" rather
  # than "word-level".
  /v1/text-to-dialogue/with-timestamps:
    post:
      operationId: createDialogueWithTimestamps
      summary: Create dialogue with timestamps
      description: >-
        Converts a dialogue script with multiple speakers into audio and
        returns character-level timing information alongside the generated
        audio.
      tags:
        - Text to Dialogue
      parameters:
        - $ref: '#/components/parameters/outputFormat'
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/TextToDialogueRequest'
      responses:
        '200':
          description: Dialogue audio with timestamp data
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/TimestampedAudioResponse'
        '400':
          description: Bad request - invalid parameters
        '401':
          description: Unauthorized - invalid or missing API key
        '422':
          description: Unprocessable entity - validation error
components:
  # API-key authentication: the key is sent in the `xi-api-key` request header.
  # Referenced by name from the top-level `security` requirement.
  securitySchemes:
    apiKeyAuth:
      type: apiKey
      name: xi-api-key
      in: header
      description: >-
        ElevenLabs API key passed in the xi-api-key header for
        authentication.
  parameters:
    # Required path parameter selecting the voice; shared via $ref by the
    # /v1/text-to-speech/{voice_id} operations.
    voiceId:
      in: path
      name: voice_id
      required: true
      schema:
        type: string
      description: >-
        The identifier of the voice to use for speech synthesis. Use the Voices
        API to list available voices.
    # Optional query parameter choosing the audio encoding of the response.
    # The description lists exactly the values accepted by `enum` below; keep
    # the two lists in sync when adding or removing formats.
    outputFormat:
      name: output_format
      in: query
      required: false
      description: >-
        The desired output audio format. Supported values are mp3_22050_32,
        mp3_44100_32, mp3_44100_64, mp3_44100_96, mp3_44100_128,
        mp3_44100_192, pcm_16000, pcm_22050, pcm_24000, pcm_44100, and
        ulaw_8000. Defaults to mp3_44100_128.
      schema:
        type: string
        default: mp3_44100_128
        enum:
          - mp3_22050_32
          - mp3_44100_32
          - mp3_44100_64
          - mp3_44100_96
          - mp3_44100_128
          - mp3_44100_192
          - pcm_16000
          - pcm_22050
          - pcm_24000
          - pcm_44100
          - ulaw_8000
  schemas:
    # JSON request body shared by all four /v1/text-to-speech operations.
    # Only `text` is required; every other property is an optional override.
    TextToSpeechRequest:
      type: object
      required:
        - text
      properties:
        text:
          type: string
          description: >-
            The text to convert to speech. Maximum length varies by model.
        model_id:
          type: string
          description: >-
            The identifier of the model to use. Use GET /v1/models to list
            available models. The model must support text to speech.
          # Matches the default documented in the ElevenLabs API reference;
          # the previous value (eleven_monolingual_v1) was out of date.
          default: eleven_multilingual_v2
        voice_settings:
          $ref: '#/components/schemas/VoiceSettings'
        pronunciation_dictionary_locators:
          type: array
          description: >-
            A list of pronunciation dictionary locators to apply to the text.
            Applied in order, with a maximum of 3 locators per request.
          items:
            $ref: '#/components/schemas/PronunciationDictionaryLocator'
          # Enforces the "maximum of 3" stated in the description above.
          maxItems: 3
        seed:
          type: integer
          description: >-
            A seed value for deterministic generation. Using the same seed
            with the same parameters produces the same audio output.
        # previous_text / next_text supply surrounding context only.
        previous_text:
          type: string
          description: >-
            Text that came before the current text for context continuity.
        next_text:
          type: string
          description: >-
            Text that comes after the current text for context continuity.
        language_code:
          type: string
          description: >-
            Language code for the text, in ISO 639-1 format. Helps the model
            produce more accurate pronunciation for the specified language.
    # JSON request body for the /v1/text-to-dialogue operations: an optional
    # model plus the required list of per-speaker segments.
    TextToDialogueRequest:
      type: object
      properties:
        model_id:
          description: The identifier of the model to use for dialogue generation.
          type: string
        segments:
          description: >-
            An array of dialogue segments, each with a speaker voice and
            text.
          type: array
          items:
            $ref: "#/components/schemas/DialogueSegment"
      required:
        - segments
    # One entry of TextToDialogueRequest.segments: a voice, the text it speaks,
    # and optional voice settings for that segment.
    DialogueSegment:
      type: object
      properties:
        voice_id:
          description: The voice identifier for this segment of dialogue.
          type: string
        text:
          description: The text content for this segment of dialogue.
          type: string
        voice_settings:
          $ref: "#/components/schemas/VoiceSettings"
      required:
        - voice_id
        - text
    # Per-request voice tuning. The three numeric knobs are each bounded to the
    # closed range 0..1; none of the properties is required.
    VoiceSettings:
      description: >-
        Voice settings that override the stored settings for the given
        voice. Applied only on the current request.
      type: object
      properties:
        stability:
          description: >-
            Controls the stability of the generated voice. Higher values produce
            more consistent output, lower values add variability.
          type: number
          minimum: 0
          maximum: 1
        similarity_boost:
          description: >-
            Controls how closely the AI adheres to the original voice. Higher
            values increase similarity to the target voice.
          type: number
          minimum: 0
          maximum: 1
        style:
          description: >-
            Controls the expressiveness and style of the speech delivery. Higher
            values produce more expressive speech.
          type: number
          minimum: 0
          maximum: 1
          default: 0
        use_speaker_boost:
          description: >-
            Enables speaker boost to increase voice clarity and reduce background
            artifacts.
          type: boolean
          default: true
    # Item type of TextToSpeechRequest.pronunciation_dictionary_locators:
    # a dictionary id paired with a version id, both required.
    PronunciationDictionaryLocator:
      type: object
      properties:
        pronunciation_dictionary_id:
          description: The identifier of the pronunciation dictionary.
          type: string
        version_id:
          description: The version identifier of the pronunciation dictionary.
          type: string
      required:
        - pronunciation_dictionary_id
        - version_id
    # JSON response body of the */with-timestamps operations: the audio as a
    # base64 string plus timing data. Timing is per character (see
    # `alignment.characters`), so the alignment description says
    # "Character-level" rather than "Word-level".
    TimestampedAudioResponse:
      type: object
      properties:
        audio_base64:
          type: string
          description: >-
            Base64 encoded audio data.
        alignment:
          type: object
          description: >-
            Character-level timing information for the generated audio.
          properties:
            characters:
              type: array
              description: >-
                Array of characters with their timing information.
              items:
                # One entry per character, with its start and end time.
                type: object
                properties:
                  character:
                    type: string
                    description: The character.
                  start_time:
                    type: number
                    description: Start time in seconds.
                  end_time:
                    type: number
                    description: End time in seconds.