---
# Swagger 2.0 (OpenAPI v2) description of the Image Analysis REST API.
swagger: '2.0'
info:
  title: Microsoft Azure Image Analysis
  # Quoted so the date-shaped version stays a string.
  version: '2023-10-01'
  # Records the emitter that generated this document.
  x-typespec-generated:
    - emitter: '@azure-tools/typespec-autorest'
# HTTPS is the only transfer protocol declared.
schemes:
  - https
# AutoRest extension: the host is built from a caller-supplied endpoint.
x-ms-parameterized-host:
  hostTemplate: '{endpoint}/computervision'
  # The endpoint value already carries its scheme, so none is prepended.
  useSchemePrefix: false
  parameters:
    - name: endpoint
      in: path
      description: |-
        Azure AI Computer Vision endpoint (protocol and hostname, for example:
        https://<resource-name>.cognitiveservices.azure.com).
      required: true
      type: string
      format: uri
      # The endpoint is substituted verbatim, without URL-encoding.
      x-ms-skip-url-encoding: true
# Document-level default media types; an operation may declare its own.
produces:
  - application/json
consumes:
  - application/json
# Either scheme satisfies a request: API key, or AAD OAuth2 with the
# Cognitive Services default scope.
security:
  - ApiKeyAuth: []
  - AadOauth2Auth:
      - https://cognitiveservices.azure.com/.default
# Definitions of the schemes referenced by the `security` requirement list.
securityDefinitions:
  # OAuth2 authorization-code flow.
  AadOauth2Auth:
    type: oauth2
    description: The Azure Active Directory OAuth2 Flow
    flow: accessCode
    authorizationUrl: https://login.microsoftonline.com/common/oauth2/authorize
    scopes:
      # The scope carries an empty description.
      https://cognitiveservices.azure.com/.default: ''
    tokenUrl: https://login.microsoftonline.com/common/oauth2/token
  # Subscription key passed in a request header.
  ApiKeyAuth:
    type: apiKey
    name: Ocp-Apim-Subscription-Key
    in: header
# Tag list; the name is quoted because it contains a colon.
tags:
  - name: 'Imageanalysis:analyze'
paths:
  # Binary-upload variant: the image bytes are the request body.
  '/imageanalysis:analyze':
    post:
      # NOTE(review): this id is styled differently from the `AnalyzeFromUrl`
      # operation and from the `AnalyzeFromImageData` example key below —
      # confirm it is intended before relying on generated method names.
      operationId: microsoftAzureAnalyzefromimagedata
      description: Performs a single Image Analysis operation
      # Operation-level media type for the raw image payload.
      consumes:
        - application/octet-stream
      parameters:
        - $ref: '#/parameters/Azure.Core.Foundations.ApiVersionParameter'
        # Required, comma-separated list of visual features (at least one).
        - name: features
          in: query
          description: >-
            A list of visual features to analyze.
            Seven visual features are supported: Caption, DenseCaptions, Read
            (OCR), Tags, Objects, SmartCrops, and People.
            At least one visual feature must be specified.
          required: true
          type: array
          items:
            type: string
            enum:
              - tags
              - caption
              - denseCaptions
              - objects
              - read
              - smartCrops
              - people
            # Extensible enum (modelAsString) with per-value documentation.
            x-ms-enum:
              name: VisualFeatures
              modelAsString: true
              values:
                - name: tags
                  value: tags
                  description: >-
                    Extract content tags for thousands of recognizable objects,
                    living beings, scenery, and actions that appear in the
                    image.
                - name: caption
                  value: caption
                  description: >-
                    Generate a human-readable caption sentence that describes
                    the content of the image.
                - name: denseCaptions
                  value: denseCaptions
                  description: >-
                    Generate human-readable caption sentences for up to 10
                    different regions in the image, including one for the whole
                    image.
                - name: objects
                  value: objects
                  description: >-
                    Object detection. This is similar to tags, but focused on
                    detecting physical objects in the image and returning their
                    location.
                - name: read
                  value: read
                  description: >-
                    Extract printed or handwritten text from the image. Also
                    known as Optical Character Recognition (OCR).
                - name: smartCrops
                  value: smartCrops
                  description: >-
                    Find representative sub-regions of the image for thumbnail
                    generation, at desired aspect ratios, with priority given to
                    detected faces.
                - name: people
                  value: people
                  description: Detect people in the image and return their location.
          collectionFormat: csv
          minItems: 1
          x-ms-client-name: visualFeatures
        # Optional result language; defaults to English.
        - name: language
          in: query
          description: >-
            The desired language for result generation (a two-letter language
            code).
            If this option is not specified, the default value 'en' is used
            (English).
            See https://aka.ms/cv-languages for a list of supported languages.
          required: false
          type: string
          default: en
          minLength: 2
        # Optional switch for gender-neutral wording in captions.
        - name: gender-neutral-caption
          in: query
          description: >-
            Boolean flag for enabling gender-neutral captioning for Caption and
            Dense Captions features.
            By default captions may contain gender terms (for example: 'man',
            'woman', or 'boy', 'girl').
            If you set this to "true", those will be replaced with
            gender-neutral terms (for example: 'person' or 'child').
          required: false
          type: boolean
          default: false
          x-ms-client-name: genderNeutralCaption
        # Optional comma-separated aspect ratios for the SmartCrops feature.
        - name: smartcrops-aspect-ratios
          in: query
          description: >-
            A list of aspect ratios to use for smart cropping.
            Aspect ratios are calculated by dividing the target crop width in
            pixels by the height in pixels.
            Supported values are between 0.75 and 1.8 (inclusive).
            If this parameter is not specified, the service will return one crop
            region with an aspect
            ratio it sees fit between 0.5 and 2.0 (inclusive).
          required: false
          type: array
          items:
            type: number
            format: float
          collectionFormat: csv
          x-ms-client-name: smartCropsAspectRatios
        # Optional model pin; defaults to 'latest'.
        - name: model-version
          in: query
          description: >-
            The version of cloud AI-model used for analysis.
            The format is the following: 'latest' (default value) or
            'YYYY-MM-DD' or 'YYYY-MM-DD-preview', where 'YYYY', 'MM', 'DD' are
            the year, month and day associated with the model.
            This is not commonly set, as the default always gives the latest AI
            model with recent improvements.
            If however you would like to make sure analysis results do not
            change over time, set this value to a specific model version.
          required: false
          type: string
          default: latest
          minLength: 6
          maxLength: 18
          # Single-quoted so the backslashes and braces are taken literally.
          pattern: '^(latest|\d{4}-\d{2}-\d{2})(-preview)?$'
          x-ms-client-name: modelVersion
        # Request body: the raw image bytes.
        - name: imageData
          in: body
          description: The image to be analyzed
          required: true
          schema:
            type: string
            format: binary
      responses:
        '200':
          description: The request has succeeded.
          schema:
            $ref: '#/definitions/ImageAnalysisResult'
        # Any other status returns the shared error envelope.
        default:
          description: An unexpected error response.
          schema:
            $ref: '#/definitions/Azure.Core.Foundations.ErrorResponse'
          headers:
            x-ms-error-code:
              type: string
              description: String error code indicating what went wrong.
      x-ms-examples:
        AnalyzeFromImageData:
          $ref: ./examples/AnalyzeFromImageData_MaximumSet.json
      summary: 'Microsoft Azure Post Imageanalysis:analyze'
      tags:
        - 'Imageanalysis:analyze'
# AutoRest extension holding paths whose key includes a query string; used
# here to declare a second POST on the same URL.
x-ms-paths:
  # URL variant: the request body is a JSON object carrying the image URL.
  '/imageanalysis:analyze?_overload=analyzeFromUrl':
    post:
      operationId: AnalyzeFromUrl
      description: Performs a single Image Analysis operation
      parameters:
        - $ref: '#/parameters/Azure.Core.Foundations.ApiVersionParameter'
        # Required, comma-separated list of visual features (at least one).
        - name: features
          in: query
          description: >-
            A list of visual features to analyze.
            Seven visual features are supported: Caption, DenseCaptions, Read
            (OCR), Tags, Objects, SmartCrops, and People.
            At least one visual feature must be specified.
          required: true
          type: array
          items:
            type: string
            enum:
              - tags
              - caption
              - denseCaptions
              - objects
              - read
              - smartCrops
              - people
            # Extensible enum (modelAsString) with per-value documentation.
            x-ms-enum:
              name: VisualFeatures
              modelAsString: true
              values:
                - name: tags
                  value: tags
                  description: >-
                    Extract content tags for thousands of recognizable objects,
                    living beings, scenery, and actions that appear in the
                    image.
                - name: caption
                  value: caption
                  description: >-
                    Generate a human-readable caption sentence that describes
                    the content of the image.
                - name: denseCaptions
                  value: denseCaptions
                  description: >-
                    Generate human-readable caption sentences for up to 10
                    different regions in the image, including one for the whole
                    image.
                - name: objects
                  value: objects
                  description: >-
                    Object detection. This is similar to tags, but focused on
                    detecting physical objects in the image and returning their
                    location.
                - name: read
                  value: read
                  description: >-
                    Extract printed or handwritten text from the image. Also
                    known as Optical Character Recognition (OCR).
                - name: smartCrops
                  value: smartCrops
                  description: >-
                    Find representative sub-regions of the image for thumbnail
                    generation, at desired aspect ratios, with priority given to
                    detected faces.
                - name: people
                  value: people
                  description: Detect people in the image and return their location.
          collectionFormat: csv
          minItems: 1
          x-ms-client-name: visualFeatures
        # Optional result language; defaults to English.
        - name: language
          in: query
          description: >-
            The desired language for result generation (a two-letter language
            code).
            If this option is not specified, the default value 'en' is used
            (English).
            See https://aka.ms/cv-languages for a list of supported languages.
          required: false
          type: string
          default: en
          minLength: 2
        # Optional switch for gender-neutral wording in captions.
        - name: gender-neutral-caption
          in: query
          description: >-
            Boolean flag for enabling gender-neutral captioning for Caption and
            Dense Captions features.
            By default captions may contain gender terms (for example: 'man',
            'woman', or 'boy', 'girl').
            If you set this to "true", those will be replaced with
            gender-neutral terms (for example: 'person' or 'child').
          required: false
          type: boolean
          default: false
          x-ms-client-name: genderNeutralCaption
        # Optional comma-separated aspect ratios for the SmartCrops feature.
        - name: smartcrops-aspect-ratios
          in: query
          description: >-
            A list of aspect ratios to use for smart cropping.
            Aspect ratios are calculated by dividing the target crop width in
            pixels by the height in pixels.
            Supported values are between 0.75 and 1.8 (inclusive).
            If this parameter is not specified, the service will return one crop
            region with an aspect
            ratio it sees fit between 0.5 and 2.0 (inclusive).
          required: false
          type: array
          items:
            type: number
            format: float
          collectionFormat: csv
          x-ms-client-name: smartCropsAspectRatios
        # Optional model pin; defaults to 'latest'.
        - name: model-version
          in: query
          description: >-
            The version of cloud AI-model used for analysis.
            The format is the following: 'latest' (default value) or
            'YYYY-MM-DD' or 'YYYY-MM-DD-preview', where 'YYYY', 'MM', 'DD' are
            the year, month and day associated with the model.
            This is not commonly set, as the default always gives the latest AI
            model with recent improvements.
            If however you would like to make sure analysis results do not
            change over time, set this value to a specific model version.
          required: false
          type: string
          default: latest
          minLength: 6
          maxLength: 18
          # Single-quoted so the backslashes and braces are taken literally.
          pattern: '^(latest|\d{4}-\d{2}-\d{2})(-preview)?$'
          x-ms-client-name: modelVersion
        # Request body: JSON object with the URL of the image.
        - name: imageUrl
          in: body
          description: The image to be analyzed
          required: true
          schema:
            $ref: '#/definitions/ImageUrl'
      responses:
        '200':
          description: The request has succeeded.
          schema:
            $ref: '#/definitions/ImageAnalysisResult'
        # Any other status returns the shared error envelope.
        default:
          description: An unexpected error response.
          schema:
            $ref: '#/definitions/Azure.Core.Foundations.ErrorResponse'
          headers:
            x-ms-error-code:
              type: string
              description: String error code indicating what went wrong.
      x-ms-examples:
        AnalyzeFromUrl:
          $ref: ./examples/AnalyzeFromUrl_MaximumSet.json
# Schemas referenced by the operations. Every array here carries
# `x-ms-identifiers: []`, i.e. no identifying property is declared for items.
definitions:
  # ---- Shared error envelope ----
  Azure.Core.Foundations.Error:
    type: object
    description: The error object.
    properties:
      code:
        type: string
        description: One of a server-defined set of error codes.
      message:
        type: string
        description: A human-readable representation of the error.
      target:
        type: string
        description: The target of the error.
      # Recursive: each detail is itself an Error.
      details:
        type: array
        description: >-
          An array of details about specific errors that led to this reported
          error.
        items:
          $ref: '#/definitions/Azure.Core.Foundations.Error'
        x-ms-identifiers: []
      innererror:
        $ref: '#/definitions/Azure.Core.Foundations.InnerError'
        description: >-
          An object containing more specific information than the current object
          about the error.
    required:
      - code
      - message
  Azure.Core.Foundations.ErrorResponse:
    type: object
    description: A response containing error details.
    properties:
      error:
        $ref: '#/definitions/Azure.Core.Foundations.Error'
        description: The error object.
    required:
      - error
  # Recursive: an InnerError may nest another InnerError.
  Azure.Core.Foundations.InnerError:
    type: object
    description: >-
      An object containing more specific information about the error. As per
      Microsoft One API guidelines -
      https://github.com/Microsoft/api-guidelines/blob/vNext/Guidelines.md#7102-error-condition-responses.
    properties:
      code:
        type: string
        description: One of a server-defined set of error codes.
      innererror:
        $ref: '#/definitions/Azure.Core.Foundations.InnerError'
        description: Inner error.
  # ---- Analysis result models ----
  CaptionResult:
    type: object
    description: >-
      Represents a generated phrase that describes the content of the whole
      image.
    properties:
      confidence:
        type: number
        format: float
        description: >-
          A score, in the range of 0 to 1 (inclusive), representing the
          confidence that this description is accurate.
          Higher values indicate higher confidence.
        minimum: 0
        maximum: 1
      text:
        type: string
        description: The text of the caption.
        minLength: 1
    required:
      - confidence
      - text
  CropRegion:
    type: object
    description: >-
      A region at the desired aspect ratio that can be used as image thumbnail.
      The region preserves as much content as possible from the analyzed image,
      with priority given to detected faces.
    properties:
      aspectRatio:
        type: number
        format: float
        description: >-
          The aspect ratio of the crop region.
          Aspect ratio is calculated by dividing the width of the region in
          pixels by its height in pixels.
          The aspect ratio will be in the range 0.75 to 1.8 (inclusive) if
          provided by the developer during the analyze call.
          Otherwise, it will be in the range 0.5 to 2.0 (inclusive).
        minimum: 0
      boundingBox:
        $ref: '#/definitions/ImageBoundingBox'
        description: The bounding box of the region.
    required:
      - aspectRatio
      - boundingBox
  DenseCaption:
    type: object
    description: >-
      Represents a generated phrase that describes the content of the whole
      image or a region in the image.
    properties:
      confidence:
        type: number
        format: float
        description: >-
          A score, in the range of 0 to 1 (inclusive), representing the
          confidence that this description is accurate.
          Higher values indicate higher confidence.
        minimum: 0
        maximum: 1
      text:
        type: string
        description: The text of the caption.
        minLength: 1
      boundingBox:
        $ref: '#/definitions/ImageBoundingBox'
        description: The image region to which this caption applies.
    required:
      - confidence
      - text
      - boundingBox
  DenseCaptionsResult:
    type: object
    description: >-
      Represents a list of up to 10 image captions for different regions of the
      image.
      The first caption always applies to the whole image.
    properties:
      values:
        type: array
        description: The list of image captions.
        minItems: 1
        items:
          $ref: '#/definitions/DenseCaption'
        x-ms-identifiers: []
    required:
      - values
  DetectedObject:
    type: object
    description: Represents a physical object detected in an image.
    properties:
      boundingBox:
        $ref: '#/definitions/ImageBoundingBox'
        description: A rectangular boundary where the object was detected.
      # NOTE(review): described as a single-item list, yet minItems is 0 and
      # no maxItems is set — confirm the intended bounds.
      tags:
        type: array
        description: A single-item list containing the object information.
        minItems: 0
        items:
          $ref: '#/definitions/DetectedTag'
        x-ms-identifiers: []
    required:
      - boundingBox
      - tags
  # NOTE(review): both properties are readOnly and also listed in `required`;
  # Swagger 2.0 says readOnly properties SHOULD NOT be required — confirm.
  DetectedPerson:
    type: object
    description: Represents a person detected in an image.
    properties:
      boundingBox:
        $ref: '#/definitions/ImageBoundingBox'
        description: A rectangular boundary where the person was detected.
        readOnly: true
      confidence:
        type: number
        format: float
        description: >-
          A score, in the range of 0 to 1 (inclusive), representing the
          confidence that this detection was accurate.
          Higher values indicate higher confidence.
        minimum: 0
        maximum: 1
        readOnly: true
    required:
      - boundingBox
      - confidence
  DetectedTag:
    type: object
    description: >-
      A content entity observation in the image. Tags can be physical objects,
      living beings, scenery, or actions,
      that appear in the image.
    properties:
      confidence:
        type: number
        format: float
        description: >-
          A score, in the range of 0 to 1 (inclusive), representing the
          confidence that this entity was observed.
          Higher values indicate higher confidence.
        minimum: 0
        maximum: 1
      name:
        type: string
        description: Name of the entity.
        minLength: 1
    required:
      - confidence
      - name
  # ---- Read (OCR) hierarchy: block -> line -> word ----
  DetectedTextBlock:
    type: object
    description: Represents a single block of detected text in the image.
    properties:
      lines:
        type: array
        description: A list of text lines in this block.
        minItems: 1
        items:
          $ref: '#/definitions/DetectedTextLine'
        x-ms-identifiers: []
    required:
      - lines
  DetectedTextLine:
    type: object
    description: Represents a single line of text in the image.
    properties:
      text:
        type: string
        description: Text content of the detected text line.
        minLength: 1
      # Exactly four points (minItems == maxItems == 4).
      boundingPolygon:
        type: array
        description: >-
          A bounding polygon around the text line. At the moment only
          quadrilaterals are supported (represented by 4 image points).
        minItems: 4
        maxItems: 4
        items:
          $ref: '#/definitions/ImagePoint'
        x-ms-identifiers: []
      words:
        type: array
        description: A list of words in this line.
        minItems: 1
        items:
          $ref: '#/definitions/DetectedTextWord'
        x-ms-identifiers: []
    required:
      - text
      - boundingPolygon
      - words
  DetectedTextWord:
    type: object
    description: >-
      A word object consisting of a contiguous sequence of characters. For
      non-space delimited languages, such as Chinese, Japanese, and Korean,
      each character is represented as its own word.
    properties:
      text:
        type: string
        description: Text content of the word.
        minLength: 1
      # Exactly four points (minItems == maxItems == 4).
      boundingPolygon:
        type: array
        description: >-
          A bounding polygon around the word. At the moment only quadrilaterals
          are supported (represented by 4 image points).
        minItems: 4
        maxItems: 4
        items:
          $ref: '#/definitions/ImagePoint'
        x-ms-identifiers: []
      confidence:
        type: number
        format: float
        description: >-
          The level of confidence that the word was detected. Confidence scores
          span the range of 0.0 to 1.0 (inclusive), with higher values
          indicating a higher confidence of detection.
        minimum: 0
        maximum: 1
    required:
      - text
      - boundingPolygon
      - confidence
  # ---- Top-level response ----
  # Only `metadata` and `modelVersion` are required; the per-feature result
  # properties are optional and renamed for clients via x-ms-client-name.
  ImageAnalysisResult:
    type: object
    description: Represents the outcome of an Image Analysis operation.
    properties:
      captionResult:
        $ref: '#/definitions/CaptionResult'
        description: The generated phrase that describes the content of the analyzed image.
        x-ms-client-name: caption
      denseCaptionsResult:
        $ref: '#/definitions/DenseCaptionsResult'
        description: >-
          The up to 10 generated phrases, the first describing the content of
          the whole image,
          and the others describing the content of different regions of the
          image.
        x-ms-client-name: denseCaptions
      metadata:
        $ref: '#/definitions/ImageMetadata'
        description: Metadata associated with the analyzed image.
      modelVersion:
        type: string
        description: The cloud AI model used for the analysis
      objectsResult:
        $ref: '#/definitions/ObjectsResult'
        description: >-
          A list of detected physical objects in the analyzed image, and their
          location.
        x-ms-client-name: objects
      peopleResult:
        $ref: '#/definitions/PeopleResult'
        description: A list of detected people in the analyzed image, and their location.
        x-ms-client-name: people
      readResult:
        $ref: '#/definitions/ReadResult'
        description: >-
          The extracted printed and hand-written text in the analyzed image.
          Also known as OCR.
        x-ms-client-name: read
      smartCropsResult:
        $ref: '#/definitions/SmartCropsResult'
        description: >-
          A list of crop regions at the desired aspect ratios (if provided)
          that can be used as image thumbnails.
          These regions preserve as much content as possible from the analyzed
          image, with priority given to detected faces.
        x-ms-client-name: smartCrops
      tagsResult:
        $ref: '#/definitions/TagsResult'
        description: A list of content tags in the analyzed image.
        x-ms-client-name: tags
    required:
      - metadata
      - modelVersion
  # ---- Geometry and metadata ----
  ImageBoundingBox:
    type: object
    description: A basic rectangle specifying a sub-region of the image.
    properties:
      x:
        type: integer
        format: int32
        description: X-coordinate of the top left point of the area, in pixels.
        minimum: 0
      # Quoted: a bare `y` is read as a boolean by YAML 1.1 parsers.
      'y':
        type: integer
        format: int32
        description: Y-coordinate of the top left point of the area, in pixels.
        minimum: 0
      # Wire name `w`; exposed to clients as `width`.
      w:
        type: integer
        format: int32
        description: Width of the area, in pixels.
        minimum: 0
        x-ms-client-name: width
      # Wire name `h`; exposed to clients as `height`.
      h:
        type: integer
        format: int32
        description: Height of the area, in pixels.
        minimum: 0
        x-ms-client-name: height
    required:
      - x
      - 'y'
      - w
      - h
  ImageMetadata:
    type: object
    description: Metadata associated with the analyzed image.
    properties:
      height:
        type: integer
        format: int32
        description: The height of the image in pixels.
        minimum: 1
      width:
        type: integer
        format: int32
        description: The width of the image in pixels.
        minimum: 1
    required:
      - height
      - width
  ImagePoint:
    type: object
    description: Represents the coordinates of a single pixel in the image.
    properties:
      x:
        type: integer
        format: int32
        description: >-
          The horizontal x-coordinate of this point, in pixels. A zero value
          corresponds to the left-most pixels in the image.
        minimum: 0
      # Quoted: a bare `y` is read as a boolean by YAML 1.1 parsers.
      'y':
        type: integer
        format: int32
        description: >-
          The vertical y-coordinate of this point, in pixels. A zero value
          corresponds to the top-most pixels in the image.
        minimum: 0
    required:
      - x
      - 'y'
  # ---- Request body of the URL-based analyze operation ----
  ImageUrl:
    type: object
    description: An object holding the publicly reachable URL of an image to analyze.
    properties:
      url:
        type: string
        format: uri
        description: Publicly reachable URL of an image to analyze.
    required:
      - url
  # ---- Per-feature result wrappers (each holds a required list) ----
  ObjectsResult:
    type: object
    description: >-
      Represents a list of physical objects detected in an image and their
      location.
    properties:
      values:
        type: array
        description: A list of physical objects detected in an image and their location.
        minItems: 0
        items:
          $ref: '#/definitions/DetectedObject'
        x-ms-identifiers: []
    required:
      - values
  PeopleResult:
    type: object
    description: Represents a list of people detected in an image and their location.
    properties:
      values:
        type: array
        description: A list of people detected in an image and their location.
        minItems: 0
        items:
          $ref: '#/definitions/DetectedPerson'
        x-ms-identifiers: []
    required:
      - values
  ReadResult:
    type: object
    description: The results of a Read (OCR) operation.
    properties:
      # Exactly one block (minItems == maxItems == 1).
      blocks:
        type: array
        description: >-
          A list of text blocks in the image. At the moment only one block is
          returned, containing all the text detected in the image.
        minItems: 1
        maxItems: 1
        items:
          $ref: '#/definitions/DetectedTextBlock'
        x-ms-identifiers: []
    required:
      - blocks
  SmartCropsResult:
    type: object
    description: >-
      Smart cropping result. A list of crop regions at the desired aspect
      ratios (if provided) that can be used as image thumbnails.
      These regions preserve as much content as possible from the analyzed
      image, with priority given to detected faces.
    properties:
      values:
        type: array
        description: A list of crop regions.
        minItems: 1
        items:
          $ref: '#/definitions/CropRegion'
        x-ms-identifiers: []
    required:
      - values
  TagsResult:
    type: object
    description: >-
      A list of entities observed in the image. Tags can be physical objects,
      living beings, scenery, or actions
      that appear in the image.
    properties:
      values:
        type: array
        description: A list of tags.
        minItems: 0
        items:
          $ref: '#/definitions/DetectedTag'
        x-ms-identifiers: []
    required:
      - values
# Reusable parameter definitions, referenced from operations via $ref.
parameters:
  # Required `api-version` query parameter.
  Azure.Core.Foundations.ApiVersionParameter:
    name: api-version
    in: query
    description: The API version to use for this operation.
    required: true
    type: string
    minLength: 1
    # AutoRest extensions: method-level placement and client-side name.
    x-ms-parameter-location: method
    x-ms-client-name: apiVersion