# OpenAPI 3.1 description of the Google Gemini API (v1beta REST surface).
openapi: 3.1.0
info:
  title: Google Gemini API
  # Folded block scalar with strip chomping: one logical line, no stray
  # trailing whitespace or newline in the rendered description.
  description: >-
    The Gemini API can generate text output from various inputs, including
    text, images, video, and audio, leveraging Gemini models.
  version: 1.0.0
  contact:
    name: Google AI
    url: https://ai.google.dev
servers:
  - url: https://generativelanguage.googleapis.com/v1beta
    description: Gemini API Server
# Default security requirement inherited by every operation.
# NOTE(review): the ApiKeyHeader scheme is not defined in the visible part of
# this file; confirm it exists under components.securitySchemes.
security:
  - ApiKeyHeader: []
# Tag names are referenced verbatim by the operations under `paths`.
tags:
  - name: Audio Understanding
    description: Operations related to Audio Understanding
  - name: Content Generation
    description: Operations related to Content Generation
  - name: Document Understanding
    description: Operations related to Document Understanding
  - name: File Api
    description: Operations related to File Api
  - name: Image Generation
    description: Operations related to Image Generation
  - name: Speech Generation
    description: Operations related to Speech Generation
  - name: Streaming
    description: Operations related to Streaming
  - name: Video Understanding
    description: Operations related to Video Understanding
paths:
/files:
  # Resumable media upload. This single operation models both phases of the
  # protocol; the X-Goog-Upload-Command header selects which one is meant.
  post:
    tags:
      - Audio Understanding
      - Document Understanding
      - File Api
      - Video Understanding
    summary: Google Upload File (Resumable)
    # Literal block scalar: keeps the line break, embeds no trailing spaces.
    description: |
      Upload a video, PDF, audio, or other media file using resumable upload protocol. Use this for files larger than 20MB,
      videos longer than approximately 1 minute, or when you want to reuse the file across multiple requests.
    operationId: uploadFile
    parameters:
      - name: X-Goog-Upload-Protocol
        in: header
        required: true
        description: Upload protocol type
        schema:
          type: string
          enum:
            - resumable
      - name: X-Goog-Upload-Command
        in: header
        required: true
        description: Upload command (start for initiation, "upload, finalize" for data upload)
        schema:
          type: string
          enum:
            - start
            # Quoted so the comma is unmistakably part of ONE enum value.
            - 'upload, finalize'
      - name: X-Goog-Upload-Header-Content-Length
        in: header
        required: false
        description: Total size of the file in bytes (required for start command)
        schema:
          type: integer
          # A byte count cannot be negative.
          minimum: 0
      - name: X-Goog-Upload-Header-Content-Type
        in: header
        required: false
        description: MIME type of the file (required for start command)
        schema:
          type: string
      - name: X-Goog-Upload-Offset
        in: header
        required: false
        description: Byte offset for upload (required for upload command)
        schema:
          type: integer
          # Offsets are counted from the first byte, so 0 is the lowest value.
          minimum: 0
      - name: Content-Length
        in: header
        required: false
        description: Size of the current chunk being uploaded
        schema:
          type: integer
          minimum: 0
    # Two alternative bodies: JSON metadata, or the raw bytes of the file.
    requestBody:
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/FilesPostRequest'
        application/octet-stream:
          schema:
            $ref: '#/components/schemas/FilesPostRequest1'
    responses:
      '200':
        description: Upload initiated or completed successfully
        headers:
          x-goog-upload-url:
            description: URL for uploading file data (returned on start command)
            schema:
              type: string
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/FileUploadResponse'
      '400':
        description: Bad request
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Error'
      '401':
        description: Unauthorized
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Error'
    # Microcks mock-server settings for this operation.
    x-microcks-operation:
      delay: 100
      dispatcher: FALLBACK
      dispatcherRules: SuccessExample
/files/{name}:
  # Metadata lookup for a previously uploaded file.
  get:
    tags:
      - File Api
    summary: Google Get File Metadata
    description: |
      Retrieve metadata for an uploaded file. Use this to verify the file was successfully
      stored and to check its processing state.
    operationId: getFile
    parameters:
      - name: name
        in: path
        required: true
        # The path template already supplies the literal `files/` segment, so
        # this parameter must hold only the trailing ID. Passing the full
        # resource name would resolve to `/files/files/abc123`.
        description: The ID of the file, i.e. the part after "files/" in its resource name (e.g., "abc123" for "files/abc123")
        schema:
          type: string
        example: abc123
    # Responses listed in ascending status-code order.
    responses:
      '200':
        description: File metadata retrieved successfully
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/FileUploadResponse'
      '401':
        description: Unauthorized
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Error'
      '404':
        description: File not found
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Error'
    # Microcks mock-server settings for this operation.
    x-microcks-operation:
      delay: 100
      dispatcher: FALLBACK
      dispatcherRules: SuccessExample
/models/{model}:countTokens:
  # Token counting for a prospective request.
  post:
    tags:
      - Audio Understanding
      - Content Generation
    summary: Google Count Tokens
    # Literal block scalar instead of a multi-line quoted string, so the line
    # break is explicit and no trailing whitespace leaks into the text.
    description: |
      Get a count of the number of tokens in content, including audio, video, images, and text.
      Useful for estimating costs and ensuring content fits within model context windows.
    operationId: countTokens
    parameters:
      - name: model
        in: path
        required: true
        description: The model to use for token counting
        schema:
          type: string
        example: gemini-2.5-flash
    requestBody:
      required: true
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/ModelsPostRequest'
          examples:
            ModelsPostRequestExample:
              $ref: '#/components/examples/ModelsPostRequestExample'
    responses:
      '200':
        description: Token count retrieved successfully
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CountTokensResponse'
      '400':
        description: Bad request
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Error'
      '401':
        description: Unauthorized
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Error'
    # Microcks mock-server settings for this operation.
    x-microcks-operation:
      delay: 100
      dispatcher: FALLBACK
      dispatcherRules: SuccessExample
# NOTE(review): this file also declares `/models/{ttsModel}:generateContent`
# and `/models/{imageModel}:generateContent`. OpenAPI treats templated paths
# that differ only in the parameter name as identical, so the three entries
# collide; they likely need to be merged into one path item.
/models/{model}:generateContent:
  # Multimodal text generation (text, image, video, audio and PDF input).
  post:
    tags:
      - Audio Understanding
      - Content Generation
      - Document Understanding
      - Video Understanding
    summary: Google Generate Content
    # Markdown kept in a literal block scalar so the lists stay readable and
    # no trailing spaces are embedded in the text.
    description: |
      Generates text output from various inputs including text, images, video, audio, and PDF documents.

      **Document Understanding**: Process PDF documents up to 1000 pages using native vision to:
      - Analyze and interpret text, images, diagrams, charts, and tables
      - Extract information into structured output formats
      - Summarize and answer questions based on visual and textual elements
      - Transcribe document content (e.g., to HTML) preserving layouts and formatting
      - Process multiple PDFs in a single request (within context window limits)

      **Video Understanding**: Process videos to describe, segment, and extract information, answer questions,
      and refer to specific timestamps.

      **Audio Understanding**: Analyze and understand audio input to:
      - Transcribe speech to text with timestamps
      - Translate audio content
      - Detect and label different speakers (speaker diarization)
      - Detect emotion in speech and music
      - Analyze specific segments and provide timestamps
      - Describe, summarize, or answer questions about audio content

      **Image Input**: Combine text with images for multimodal understanding.
    operationId: generateContent
    parameters:
      - name: model
        in: path
        required: true
        description: The model to use for generation
        schema:
          type: string
          enum:
            - gemini-2.5-flash
            - gemini-2.5-pro
        example: gemini-2.5-flash
    requestBody:
      required: true
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/GenerateContentRequest'
    responses:
      '200':
        description: Successful response
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/GenerateContentResponse'
      '400':
        description: Bad request
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Error'
      '401':
        description: Unauthorized - Invalid API key
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Error'
      '429':
        description: Too many requests
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Error'
    # Microcks mock-server settings for this operation.
    x-microcks-operation:
      delay: 100
      dispatcher: FALLBACK
      dispatcherRules: SuccessExample
# NOTE(review): structurally identical to `/models/{model}:generateContent`
# (only the template parameter name differs), which OpenAPI does not allow as
# a separate path; consider merging the operations.
/models/{ttsModel}:generateContent:
  # Text-to-speech generation.
  post:
    tags:
      - Speech Generation
    summary: Google Generate Speech (Text-to-Speech)
    # Literal block scalar; trailing spaces from the escaped form are dropped.
    description: |
      Transform text input into single-speaker or multi-speaker audio using native text-to-speech (TTS)
      generation capabilities. TTS is controllable through natural language to guide style, accent, pace,
      and tone of the audio.

      **Capabilities:**
      - Single-speaker or multi-speaker audio (up to 2 speakers)
      - 30 voice options with different characteristics (bright, upbeat, informative, etc.)
      - 24 supported languages with automatic language detection
      - Controllable style, tone, accent, and pace via prompts
      - Audio output in PCM format (24kHz, 16-bit, mono)

      **Note**: TTS models accept text-only inputs and produce audio-only outputs.
    operationId: generateSpeech
    parameters:
      - name: ttsModel
        in: path
        required: true
        description: The TTS model to use for speech generation
        schema:
          type: string
          enum:
            - gemini-2.5-flash-preview-tts
            - gemini-2.5-pro-preview-tts
        example: gemini-2.5-flash-preview-tts
    requestBody:
      required: true
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/SpeechGenerationRequest'
    responses:
      '200':
        description: Successful response with generated audio
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/SpeechGenerationResponse'
      '400':
        description: Bad request
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Error'
      '401':
        description: Unauthorized - Invalid API key
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Error'
      '429':
        description: Too many requests
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Error'
    # Microcks mock-server settings for this operation.
    x-microcks-operation:
      delay: 100
      dispatcher: FALLBACK
      dispatcherRules: SuccessExample
/models/{model}:streamGenerateContent:
  # Streaming variant of content generation; the 200 response is declared as
  # a text/event-stream body.
  post:
    tags:
      - Content Generation
      - Streaming
    summary: Google Stream Generate Content
    description: Generates text output with streaming responses, receiving GenerateContentResponse instances incrementally
    operationId: streamGenerateContent
    parameters:
      - name: model
        in: path
        required: true
        description: The model to use for generation
        schema:
          type: string
          enum: [gemini-2.5-flash, gemini-2.5-pro]
        example: gemini-2.5-flash
      # `sse` is the only value this spec allows for the `alt` parameter.
      - name: alt
        in: query
        required: true
        description: Alternative response format
        schema:
          type: string
          enum: [sse]
        example: sse
    requestBody:
      required: true
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/GenerateContentRequest'
    responses:
      '200':
        description: Successful streaming response
        content:
          text/event-stream:
            schema:
              $ref: '#/components/schemas/ModelsPostResponse'
            examples:
              ModelsPostResponseExample:
                $ref: '#/components/examples/ModelsPostResponseExample'
      '400':
        description: Bad request
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Error'
      '401':
        description: Unauthorized - Invalid API key
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Error'
    # Microcks mock-server settings; the dispatcher rule names the example
    # attached to the 200 response above.
    x-microcks-operation:
      delay: 100
      dispatcher: FALLBACK
      dispatcherRules: ModelsPostResponseExample
# NOTE(review): structurally identical to `/models/{model}:generateContent`
# (only the template parameter name differs), which OpenAPI does not allow as
# a separate path; consider merging the operations.
/models/{imageModel}:generateContent:
  # Conversational image generation and editing.
  post:
    tags:
      - Image Generation
    summary: Google Generate Images
    # Literal block scalar; the trailing space of the escaped form is dropped.
    description: |
      Generate and process images conversationally. Supports text-to-image, text-and-image-to-image,
      and multi-turn image editing. All generated images include a SynthID watermark.
    operationId: generateImage
    parameters:
      - name: imageModel
        in: path
        required: true
        description: The image generation model to use
        schema:
          type: string
          enum:
            - gemini-2.5-flash-image
            - gemini-3-pro-image-preview
        example: gemini-2.5-flash-image
    requestBody:
      required: true
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/ImageGenerationRequest'
    responses:
      '200':
        description: Successful response with generated image
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ImageGenerationResponse'
      '400':
        description: Bad request
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Error'
      '401':
        description: Unauthorized - Invalid API key
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Error'
      '429':
        description: Too many requests
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Error'
    # Microcks mock-server settings for this operation.
    x-microcks-operation:
      delay: 100
      dispatcher: FALLBACK
      dispatcherRules: SuccessExample
components:
schemas:
AudioCandidate:
  # One generated speech candidate: its audio content, why generation
  # stopped, and its index.
  type: object
  properties:
    content:
      $ref: '#/components/schemas/AudioContent'
      description: Generated audio content
    finishReason:
      type: string
      description: Reason why generation stopped
      enum:
        - FINISH_REASON_UNSPECIFIED
        - STOP
        - MAX_TOKENS
        - SAFETY
        - RECITATION
        - OTHER
    index:
      type: integer
      description: Index of the candidate
AudioContent:
  # Audio payload: a list of parts, each of which may carry inline audio.
  type: object
  properties:
    parts:
      type: array
      description: Audio data parts
      items:
        type: object
        properties:
          inlineData:
            type: object
            # Both fields are mandatory whenever inlineData is present.
            required: [mimeType, data]
            properties:
              mimeType:
                type: string
                description: MIME type of the audio (PCM format)
                example: audio/pcm
              data:
                type: string
                format: byte
                description: Base64-encoded audio data (24kHz, 16-bit, mono PCM)
Candidate:
  # One generated candidate: content, finish reason, index and the safety
  # ratings attached to the output.
  type: object
  properties:
    content:
      $ref: '#/components/schemas/Content'
      description: Generated content
    finishReason:
      type: string
      description: Reason why generation stopped
      enum:
        - FINISH_REASON_UNSPECIFIED
        - STOP
        - MAX_TOKENS
        - SAFETY
        - RECITATION
        - OTHER
    index:
      type: integer
      description: Index of the candidate
    safetyRatings:
      type: array
      description: Safety ratings for the generated content
      items:
        $ref: '#/components/schemas/SafetyRating'
Content:
  # A single message: an optional role plus an ordered list of parts.
  type: object
  required: [parts]
  properties:
    role:
      type: string
      description: The role of the content producer (optional for single-turn, required for multi-turn)
      enum: [user, model]
    parts:
      type: array
      description: Ordered parts that constitute a single message
      items:
        $ref: '#/components/schemas/Part'
CountTokensResponse:
  # Result of the countTokens operation.
  type: object
  properties:
    totalTokens:
      description: Total number of tokens in the provided content
      type: integer
    totalBillableCharacters:
      description: Total billable characters (for audio/video content)
      type: integer
Error:
  # Error envelope shared by the 4xx responses in this file; all details
  # live under the single `error` property.
  type: object
  properties:
    error:
      type: object
      properties:
        code:
          description: HTTP status code
          type: integer
        message:
          description: Error message
          type: string
        status:
          description: Error status
          type: string
FileData:
  type: object
  description: Reference to a file uploaded via the File API or a YouTube URL
  # Only the URI is mandatory; the MIME type may be left out.
  required: [file_uri]
  properties:
    mime_type:
      type: string
      description: MIME type of the file (optional for YouTube URLs)
      example: video/mp4
    file_uri:
      type: string
      description: URI of the file from File API or YouTube URL
      examples:
        - https://generativelanguage.googleapis.com/v1beta/files/abc123
        - https://www.youtube.com/watch?v=9hE5-98ZeCg
# Metadata for an uploaded file, wrapped in a top-level `file` property.
# Referenced by the 200 responses of uploadFile and getFile.
# NOTE(review): indentation is not visible in this view, so it cannot be
# confirmed whether `error` nests under `file` or sits beside it — verify
# against the properly indented source before restructuring.
FileUploadResponse:
type: object
properties:
file:
type: object
properties:
name:
type: string
description: Resource name of the file
example: files/abc123
display_name:
type: string
description: Display name of the file
mime_type:
type: string
description: MIME type of the file
example: video/mp4
# Declared as a string, not an integer.
size_bytes:
type: string
description: Size of the file in bytes
create_time:
type: string
format: date-time
description: Creation timestamp
update_time:
type: string
format: date-time
description: Last update timestamp
expiration_time:
type: string
format: date-time
description: Expiration timestamp
sha256_hash:
type: string
description: SHA-256 hash of the file
uri:
type: string
description: URI to use in generateContent requests
example: https://generativelanguage.googleapis.com/v1beta/files/abc123
# Processing state of the file; `error` below is documented as applying to FAILED.
state:
type: string
enum:
- STATE_UNSPECIFIED
- PROCESSING
- ACTIVE
- FAILED
description: Processing state of the file
error:
type: object
description: Error details if state is FAILED
properties:
message:
type: string
FilesPostRequest:
  # JSON body of the upload operation: file metadata under `file`.
  type: object
  description: FilesPostRequest schema
  properties:
    file:
      $ref: '#/components/schemas/FilesPostRequestFile'
FilesPostRequest1:
  # Schema used for the application/octet-stream body of the upload operation.
  # NOTE(review): OpenAPI 3.1 no longer gives `format: binary` a special
  # meaning for request bodies; confirm the tooling in use still expects it.
  type: string
  format: binary
  description: FilesPostRequest1 schema
FilesPostRequestFile:
  # Metadata object nested under `file` in FilesPostRequest.
  type: object
  description: FilesPostRequestFile schema
  properties:
    display_name:
      description: Display name for the file
      type: string
GenerateContentRequest:
  # Request body shared by generateContent and streamGenerateContent.
  type: object
  required: [contents]
  properties:
    contents:
      type: array
      description: The content of the current conversation with the model
      items:
        $ref: '#/components/schemas/Content'
    # Property names mix snake_case and camelCase; kept exactly as declared.
    system_instruction:
      $ref: '#/components/schemas/Content'
      description: System instructions to guide model behavior
    generationConfig:
      $ref: '#/components/schemas/GenerationConfig'
      description: Configuration options for model generation
GenerateContentResponse:
  # Response body of generateContent: candidates, prompt feedback and usage.
  type: object
  properties:
    candidates:
      type: array
      description: Generated content candidates
      items:
        $ref: '#/components/schemas/Candidate'
    promptFeedback:
      $ref: '#/components/schemas/PromptFeedback'
      description: Feedback about the prompt
    usageMetadata:
      $ref: '#/components/schemas/UsageMetadata'
      description: Token usage information
GenerationConfig:
  type: object
  description: Configuration options for model generation and outputs
  properties:
    stopSequences:
      type: array
      description: Set of character sequences that will stop output generation
      items:
        type: string
      # At most five stop sequences may be supplied.
      maxItems: 5
    temperature:
      type: number
      format: float
      description: Controls randomness of output (default 1.0 for Gemini 3 models)
      minimum: 0.0
      maximum: 2.0
      default: 1.0
    topP:
      type: number
      format: float
      description: Maximum cumulative probability of tokens to consider
      minimum: 0.0
      maximum: 1.0
    topK:
      type: integer
      description: Maximum number of tokens to consider
      minimum: 1
    maxOutputTokens:
      type: integer
      description: Maximum number of tokens to generate
      minimum: 1
    thinkingConfig:
      $ref: '#/components/schemas/ThinkingConfig'
      description: Configuration for thinking mode (Gemini 2.5 models)
    response_mime_type:
      type: string
      description: MIME type for structured output (e.g., application/json)
      enum:
        - text/plain
        - application/json
        # Enum-as-string output; the Gemini API documents this value
        # alongside the two above.
        - text/x.enum
      example: application/json
    response_schema:
      type: object
      description: JSON schema for structured output format
      # Free-form object: any keys are accepted.
      additionalProperties: true
ImageCandidate:
  # One image-generation candidate; same fields as Candidate, but `content`
  # points at ImageContent.
  type: object
  properties:
    content:
      $ref: '#/components/schemas/ImageContent'
      description: Generated content including text and/or images
    finishReason:
      type: string
      description: Reason why generation stopped
      enum:
        - FINISH_REASON_UNSPECIFIED
        - STOP
        - MAX_TOKENS
        - SAFETY
        - RECITATION
        - OTHER
    index:
      type: integer
      description: Index of the candidate
    safetyRatings:
      type: array
      description: Safety ratings for the generated content
      items:
        $ref: '#/components/schemas/SafetyRating'
ImageConfig:
  type: object
  description: Configuration for image generation parameters
  properties:
    aspectRatio:
      type: string
      description: Aspect ratio for generated images
      # Quoted on purpose: digits-and-colon scalars can be read as
      # sexagesimal integers by YAML 1.1 parsers.
      enum: ['1:1', '3:4', '4:3', '9:16', '16:9']
      example: '16:9'
    imageSize:
      type: string
      description: Size of generated images
      enum: [1K, 2K, 4K]
      example: 2K
ImageContent:
  # Message shape used in image-generation candidates.
  type: object
  required: [parts]
  properties:
    role:
      type: string
      description: The role of the content producer
      enum: [user, model]
    parts:
      type: array
      description: Ordered parts that constitute a single message (can include text and inline image data)
      items:
        $ref: '#/components/schemas/Part'
ImageGenerationConfig:
  type: object
  description: Configuration options for image generation
  properties:
    responseModalities:
      type: array
      description: Modalities to include in the response
      items:
        type: string
        enum: [TEXT, IMAGE]
      # Example value for the whole array.
      example:
        - TEXT
        - IMAGE
    imageConfig:
      $ref: '#/components/schemas/ImageConfig'
      description: Configuration for image generation
ImageGenerationRequest:
  # Request body of the generateImage operation.
  type: object
  required: [contents]
  properties:
    contents:
      type: array
      description: The content of the current conversation with the model
      items:
        $ref: '#/components/schemas/Content'
    generationConfig:
      $ref: '#/components/schemas/ImageGenerationConfig'
      description: Configuration options for image generation
ImageGenerationResponse:
  # Response body of the generateImage operation.
  type: object
  properties:
    candidates:
      type: array
      description: Generated content candidates with images
      items:
        $ref: '#/components/schemas/ImageCandidate'
    promptFeedback:
      $ref: '#/components/schemas/PromptFeedback'
      description: Feedback about the prompt
    usageMetadata:
      $ref: '#/components/schemas/UsageMetadata'
      description: Token usage information
InlineData:
  # Media embedded directly in a request part as base64 text.
  type: object
  required: [mime_type, data]
  properties:
    mime_type:
      type: string
      description: MIME type of the data
      enum:
        # Images
        - image/jpeg
        - image/png
        - image/webp
        # Video
        - video/mp4
        - video/mpeg
        - video/mov
        - video/avi
        - video/x-flv
        - video/mpg
        - video/webm
        - video/wmv
        - video/3gpp
        # Audio
        - audio/wav
        - audio/mp3
        - audio/aiff
        - audio/aac
        - audio/ogg
        - audio/flac
        # Documents
        - application/pdf
    data:
      type: string
      format: byte
      description: Base64 encoded data
ModelsPostRequest:
  # Request body of the countTokens operation.
  type: object
  description: ModelsPostRequest schema
  required: [contents]
  properties:
    contents:
      type: array
      description: The content to count tokens for
      items:
        $ref: '#/components/schemas/Content'
ModelsPostResponse:
  # Used as the text/event-stream body of streamGenerateContent, hence a
  # plain string rather than an object.
  description: Server-Sent Events stream of GenerateContentResponse objects
  type: string
MultiSpeakerVoiceConfig:
  # Voice assignment for multi-speaker speech generation.
  type: object
  required:
    - speakerVoiceConfigs
  properties:
    speakerVoiceConfigs:
      type: array
      # Wording aligned with the constraints below: minItems and maxItems are
      # both 2, so the array must hold exactly two entries, not "up to" two.
      description: Voice configurations for each speaker (exactly 2 entries)
      minItems: 2
      maxItems: 2
      items:
        $ref: '#/components/schemas/SpeakerVoiceConfig'
Part:
  type: object
  description: A part of the content, can be text, inline data, or file data
  # Exactly one of the three shapes below must match.
  oneOf:
    # 1. Plain text
    - type: object
      required: [text]
      properties:
        text:
          type: string
          description: Text content
    # 2. Inline media
    - type: object
      required: [inline_data]
      properties:
        inline_data:
          $ref: '#/components/schemas/InlineData'
    # 3. Reference to a file, with optional video metadata
    - type: object
      required: [file_data]
      properties:
        file_data:
          $ref: '#/components/schemas/FileData'
        video_metadata:
          $ref: '#/components/schemas/VideoMetadata'
          description: Optional metadata for video processing (clipping, FPS)
PrebuiltVoiceConfig:
  # Selects one of the prebuilt voices by name.
  type: object
  required: [voiceName]
  properties:
    voiceName:
      type: string
      description: Name of the prebuilt voice with characteristic (Bright, Upbeat, Informative, etc.)
      example: Kore
      # 30 voice names, in the order originally listed.
      enum:
        - Zephyr
        - Puck
        - Charon
        - Kore
        - Fenrir
        - Leda
        - Orus
        - Aoede
        - Callirrhoe
        - Autonoe
        - Enceladus
        - Iapetus
        - Umbriel
        - Algieba
        - Despina
        - Erinome
        - Algenib
        - Rasalgethi
        - Laomedeia
        - Achernar
        - Alnilam
        - Schedar
        - Gacrux
        - Pulcherrima
        - Achird
        - Zubenelgenubi
        - Vindemiatrix
        - Sadachbia
        - Sadaltager
        - Sulafat
PromptFeedback:
  # Feedback about the prompt: block reason and safety ratings.
  type: object
  properties:
    blockReason:
      type: string
      description: Reason why prompt was blocked
      enum: [BLOCK_REASON_UNSPECIFIED, SAFETY, OTHER]
    safetyRatings:
      type: array
      description: Safety ratings for the prompt
      items:
        $ref: '#/components/schemas/SafetyRating'
SafetyRating:
  # Pairs a harm category with a probability level.
  type: object
  properties:
    category:
      type: string
      enum:
        - HARM_CATEGORY_UNSPECIFIED
        - HARM_CATEGORY_DEROGATORY
        - HARM_CATEGORY_TOXICITY
        - HARM_CATEGORY_VIOLENCE
        - HARM_CATEGORY_SEXUAL
        - HARM_CATEGORY_MEDICAL
        - HARM_CATEGORY_DANGEROUS
        - HARM_CATEGORY_HARASSMENT
        - HARM_CATEGORY_HATE_SPEECH
        - HARM_CATEGORY_SEXUALLY_EXPLICIT
        - HARM_CATEGORY_DANGEROUS_CONTENT
    probability:
      type: string
      enum:
        - HARM_PROBABILITY_UNSPECIFIED
        - NEGLIGIBLE
        - LOW
        - MEDIUM
        - HIGH
SpeakerVoiceConfig:
  # Binds one named speaker to a voice configuration.
  type: object
  required: [speaker, voiceConfig]
  properties:
    speaker:
      type: string
      description: Name of the speaker (must match names used in prompt)
      example: Joe
    voiceConfig:
      $ref: '#/components/schemas/VoiceConfig'
      description: Voice configuration for this speaker
SpeechConfig:
  type: object
  description: Configuration for single or multi-speaker speech
  # Exactly one of the two shapes below must match.
  oneOf:
    # Single speaker
    - type: object
      required: [voiceConfig]
      properties:
        voiceConfig:
          $ref: '#/components/schemas/VoiceConfig'
          description: Single-speaker voice configuration
    # Multiple speakers
    - type: object
      required: [multiSpeakerVoiceConfig]
      properties:
        multiSpeakerVoiceConfig:
          $ref: '#/components/schemas/MultiSpeakerVoiceConfig'
          description: Multi-speaker voice configuration (up to 2 speakers)
SpeechGenerationConfig:
type: object
required:
- responseModalities
- speechConfig
properties:
responseModalities:
type: array
# --- truncated at 32 KB (35 KB total) ---
# Full source: https://raw.githubusercontent.com/api-evangelist/google/refs/heads/main/openapi/google-gemini-api-openapi.yml