elevenlabs

ElevenLabs Voice Changer API

The ElevenLabs Voice Changer API performs speech-to-speech conversion, replacing one voice with another while preserving the original speech content, timing, and emotional delivery. Developers can transform audio recordings to sound like a different speaker using any voice from the ElevenLabs library or a custom cloned voice. This is useful for content creation, privacy protection, and character voice generation.

Documentation GitHub OpenAPI

Documentation

📖

Documentation

https://elevenlabs.io/docs/api-reference/speech-to-speech/convert

Specifications

⚙

OpenAPI

https://raw.githubusercontent.com/api-evangelist/elevenlabs/refs/heads/main/openapi/elevenlabs-voice-changer-openapi.yml

OpenAPI Specification

# OpenAPI Specification version this document is written against.
# Quoted so the value can only ever be loaded as a string.
openapi: '3.1.0'

# General metadata describing the API.
info:
  title: ElevenLabs Voice Changer API
  version: '1.0'
  description: >-
    The ElevenLabs Voice Changer API performs speech-to-speech
    conversion, replacing one voice with another while preserving the
    original speech content, timing, and emotional delivery. Developers
    can transform audio recordings to sound like a different speaker
    using any voice from the ElevenLabs library or a custom cloned voice.
  termsOfService: https://elevenlabs.io/terms-of-service
  contact:
    name: ElevenLabs Support
    url: https://help.elevenlabs.io

# Link to the human-readable reference documentation.
externalDocs:
  url: https://elevenlabs.io/docs/api-reference/speech-to-speech/convert
  description: ElevenLabs Voice Changer API Documentation

# Base URL against which the entries under `paths` are resolved.
servers:
  - description: Production Server
    url: https://api.elevenlabs.io

# Tag used to group the operations declared in this document.
tags:
  - name: Speech to Speech
    description: >-
      Endpoints for converting speech from one voice to another
      while preserving the original speech characteristics.

# Document-wide security requirement; refers to the apiKeyAuth scheme
# declared under components.securitySchemes.
security:
  - apiKeyAuth: []
paths:
  # Voice conversion for the target voice named by the voice_id path
  # segment. The source recording is uploaded as multipart/form-data.
  /v1/speech-to-speech/{voice_id}:
    post:
      tags:
        - Speech to Speech
      summary: Voice changer
      operationId: convertVoice
      description: >-
        Converts an audio recording to use a different voice while
        preserving the original speech content, timing, and emotional
        delivery. The target voice can be any voice available in the
        user's library.
      parameters:
        # Required path parameter selecting the target voice.
        - $ref: '#/components/parameters/voiceId'
        # Optional query parameter selecting the output encoding.
        - $ref: '#/components/parameters/outputFormat'
      requestBody:
        required: true
        content:
          multipart/form-data:
            schema:
              $ref: '#/components/schemas/SpeechToSpeechRequest'
      responses:
        # NOTE(review): only audio/mpeg is declared here, yet the
        # output_format parameter also allows pcm_* and ulaw_* values.
        # Confirm which media type the server returns for those.
        '200':
          description: Voice conversion completed successfully
          content:
            audio/mpeg:
              schema:
                format: binary
                type: string
        # The error responses below declare no body schema.
        '400':
          description: Bad request - invalid audio or parameters
        '401':
          description: Unauthorized - invalid or missing API key
        '422':
          description: Unprocessable entity - audio could not be processed
  # Streaming variant of the voice conversion operation. It takes the
  # same parameters and multipart request schema as the operation at
  # /v1/speech-to-speech/{voice_id}.
  /v1/speech-to-speech/{voice_id}/stream:
    post:
      tags:
        - Speech to Speech
      summary: Voice changer stream
      operationId: streamConvertedVoice
      description: >-
        Converts an audio recording to use a different voice and streams
        the result using chunked transfer encoding. Useful for real-time
        processing and playback scenarios.
      parameters:
        # Required path parameter selecting the target voice.
        - $ref: '#/components/parameters/voiceId'
        # Optional query parameter selecting the output encoding.
        - $ref: '#/components/parameters/outputFormat'
      requestBody:
        required: true
        content:
          multipart/form-data:
            schema:
              $ref: '#/components/schemas/SpeechToSpeechRequest'
      responses:
        # NOTE(review): only audio/mpeg is declared here, yet the
        # output_format parameter also allows pcm_* and ulaw_* values.
        # Confirm which media type the server returns for those.
        '200':
          description: Streaming voice conversion response
          content:
            audio/mpeg:
              schema:
                format: binary
                type: string
        # The error responses below declare no body schema.
        '400':
          description: Bad request - invalid audio or parameters
        '401':
          description: Unauthorized - invalid or missing API key
        '422':
          description: Unprocessable entity - audio could not be processed
components:
  securitySchemes:
    # API-key scheme referenced by the document-level `security` entry.
    # The key is carried in the request header named below.
    apiKeyAuth:
      type: apiKey
      name: xi-api-key
      in: header
      description: >-
        ElevenLabs API key passed in the xi-api-key header for
        authentication.
  # Reusable parameters referenced by both speech-to-speech operations.
  parameters:
    # Path segment {voice_id}; always required.
    voiceId:
      in: path
      name: voice_id
      required: true
      schema:
        type: string
      description: The identifier of the target voice to convert the audio to.
    # Optional query parameter; falls back to the schema default below.
    outputFormat:
      in: query
      name: output_format
      required: false
      description: The desired output audio format.
      schema:
        type: string
        default: mp3_44100_128
        enum:
          # mp3_* values
          - mp3_22050_32
          - mp3_44100_32
          - mp3_44100_64
          - mp3_44100_96
          - mp3_44100_128
          - mp3_44100_192
          # pcm_* values
          - pcm_16000
          - pcm_22050
          - pcm_24000
          - pcm_44100
          # ulaw_* value
          - ulaw_8000
  schemas:
    # Multipart form payload used as the request body of both the convert
    # and the stream operations. Only the `audio` part is mandatory.
    SpeechToSpeechRequest:
      type: object
      required:
        - audio
      properties:
        # The uploaded source recording (binary file part).
        audio:
          type: string
          format: binary
          description: >-
            The source audio file containing the speech to convert. Supports
            common audio formats including MP3, WAV, and OGG.
        # Optional model selector; the default below is the one the
        # upstream API reference documents for this endpoint.
        model_id:
          type: string
          default: eleven_english_sts_v2
          description: >-
            The identifier of the model to use for voice conversion.
        # Per-request overrides for the target voice. In a
        # multipart/form-data body an object-typed property is serialized
        # as application/json by default, so this part travels as a
        # JSON-encoded string.
        voice_settings:
          type: object
          description: >-
            Voice settings to override the default settings for the target
            voice.
          properties:
            stability:
              type: number
              description: >-
                Controls the stability of the converted voice output.
              minimum: 0
              maximum: 1
            similarity_boost:
              type: number
              description: >-
                Controls how closely the output matches the target voice.
              minimum: 0
              maximum: 1
            style:
              type: number
              description: >-
                Controls the expressiveness of the converted speech.
              minimum: 0
              maximum: 1
            use_speaker_boost:
              type: boolean
              description: >-
                Enables speaker boost for increased clarity.
        # Bounded to the unsigned 32-bit range documented by the upstream
        # API so that out-of-range seeds are caught by schema validation.
        seed:
          type: integer
          minimum: 0
          maximum: 4294967295
          description: >-
            A seed value for deterministic generation. Must be an integer
            between 0 and 4294967295. Determinism is best-effort and is not
            guaranteed.