Microsoft Azure
Microsoft Azure Image Analysis

Microsoft Azure Image Analysis is a service that analyzes and describes the content of images using machine learning models. With this service, users can extract information from images such as captions, content tags, objects, people, and printed or handwritten text (OCR), as well as smart-crop regions suitable for thumbnails. It can be used for a variety of applications, from automated image tagging and sorting to text extraction and thumbnail generation.
Documentation GitHub OpenAPI
Documentation

📖
Documentation
https://learn.microsoft.com/en-us/azure/ai-services/computer-vision/
Specifications

⚙
OpenAPI
https://raw.githubusercontent.com/api-evangelist/microsoft-azure/refs/heads/main/openapi/image-analysis-openapi-original.yml
OpenAPI Specification

# Swagger 2.0 description of the Microsoft Azure Image Analysis API.
swagger: '2.0'
info:
  # Quoted so that YAML loaders keep the version as a string instead of
  # resolving it to a date.
  version: '2023-10-01'
  title: Microsoft Azure Image Analysis
  # Emitter recorded as the generator of this document.
  x-typespec-generated:
    - emitter: '@azure-tools/typespec-autorest'
# HTTPS is the only transfer scheme listed.
schemes: [https]
# The host is built from the caller-supplied `endpoint` followed by
# '/computervision'.
x-ms-parameterized-host:
  useSchemePrefix: false
  hostTemplate: '{endpoint}/computervision'
  parameters:
    - name: endpoint
      in: path
      required: true
      type: string
      format: uri
      # The endpoint value is substituted without URL encoding.
      x-ms-skip-url-encoding: true
      description: |-
        Azure AI Computer Vision endpoint (protocol and hostname, for example:
        https://<resource-name>.cognitiveservices.azure.com).
# Document-level media types; an operation may declare its own `consumes`.
produces: [application/json]
consumes: [application/json]
# Two security requirements are listed: a subscription key, or an OAuth2
# token carrying the Cognitive Services default scope.
security:
  - ApiKeyAuth: []
  - AadOauth2Auth:
      - https://cognitiveservices.azure.com/.default
securityDefinitions:
  # API key sent in the Ocp-Apim-Subscription-Key request header.
  ApiKeyAuth:
    type: apiKey
    in: header
    name: Ocp-Apim-Subscription-Key
  # OAuth2 using the accessCode flow.
  AadOauth2Auth:
    type: oauth2
    flow: accessCode
    description: The Azure Active Directory OAuth2 Flow
    authorizationUrl: https://login.microsoftonline.com/common/oauth2/authorize
    tokenUrl: https://login.microsoftonline.com/common/oauth2/token
    scopes:
      https://cognitiveservices.azure.com/.default: ''
# Tag attached to the image-analysis operation under `paths`.
tags:
  # Quoted because the name contains a colon.
  - name: 'Imageanalysis:analyze'
paths:
  # Analyze an image supplied as raw bytes in the request body.
  /imageanalysis:analyze:
    post:
      # NOTE(review): the naming style differs from the URL overload's
      # operationId (`AnalyzeFromUrl`); left unchanged because generated
      # clients may depend on it.
      operationId: microsoftAzureAnalyzefromimagedata
      description: Performs a single Image Analysis operation
      # Overrides the document-level `consumes`: the body is binary.
      consumes:
        - application/octet-stream
      parameters:
        - $ref: '#/parameters/Azure.Core.Foundations.ApiVersionParameter'
        # Required, comma-separated list with at least one entry.
        - name: features
          in: query
          description: >-
            A list of visual features to analyze.

            Seven visual features are supported: Caption, DenseCaptions, Read
            (OCR), Tags, Objects, SmartCrops, and People.

            At least one visual feature must be specified.
          required: true
          type: array
          items:
            type: string
            enum:
              - tags
              - caption
              - denseCaptions
              - objects
              - read
              - smartCrops
              - people
            x-ms-enum:
              name: VisualFeatures
              modelAsString: true
              values:
                - name: tags
                  value: tags
                  description: >-
                    Extract content tags for thousands of recognizable objects,
                    living beings, scenery, and actions that appear in the
                    image.
                - name: caption
                  value: caption
                  description: >-
                    Generate a human-readable caption sentence that describes
                    the content of the image.
                - name: denseCaptions
                  value: denseCaptions
                  description: >-
                    Generate human-readable caption sentences for up to 10
                    different regions in the image, including one for the whole
                    image.
                - name: objects
                  value: objects
                  description: >-
                    Object detection. This is similar to tags, but focused on
                    detecting physical objects in the image and returning their
                    location.
                - name: read
                  value: read
                  description: >-
                    Extract printed or handwritten text from the image. Also
                    known as Optical Character Recognition (OCR).
                - name: smartCrops
                  value: smartCrops
                  description: >-
                    Find representative sub-regions of the image for thumbnail
                    generation, at desired aspect ratios, with priority given to
                    detected faces.
                - name: people
                  value: people
                  description: Detect people in the image and return their location.
          collectionFormat: csv
          minItems: 1
          x-ms-client-name: visualFeatures
        # Optional; defaults to 'en'.
        - name: language
          in: query
          description: >-
            The desired language for result generation (a two-letter language
            code).

            If this option is not specified, the default value 'en' is used
            (English).

            See https://aka.ms/cv-languages for a list of supported languages.
          required: false
          type: string
          default: en
          minLength: 2
        # Optional; defaults to false.
        - name: gender-neutral-caption
          in: query
          description: >-
            Boolean flag for enabling gender-neutral captioning for Caption and
            Dense Captions features.

            By default captions may contain gender terms (for example: 'man',
            'woman', or 'boy', 'girl').

            If you set this to "true", those will be replaced with
            gender-neutral terms (for example: 'person' or 'child').
          required: false
          type: boolean
          default: false
          x-ms-client-name: genderNeutralCaption
        # Optional, comma-separated list of floats.
        - name: smartcrops-aspect-ratios
          in: query
          description: >-
            A list of aspect ratios to use for smart cropping.

            Aspect ratios are calculated by dividing the target crop width in
            pixels by the height in pixels.

            Supported values are between 0.75 and 1.8 (inclusive).

            If this parameter is not specified, the service will return one crop
            region with an aspect ratio it sees fit between 0.5 and 2.0
            (inclusive).
          required: false
          type: array
          items:
            type: number
            format: float
          collectionFormat: csv
          x-ms-client-name: smartCropsAspectRatios
        # Optional; defaults to 'latest'. The pattern accepts 'latest' or a
        # YYYY-MM-DD date, each optionally followed by '-preview'.
        - name: model-version
          in: query
          description: >-
            The version of cloud AI-model used for analysis.

            The format is the following: 'latest' (default value) or
            'YYYY-MM-DD' or 'YYYY-MM-DD-preview', where 'YYYY', 'MM', 'DD' are
            the year, month and day associated with the model.

            This is not commonly set, as the default always gives the latest AI
            model with recent improvements.

            If however you would like to make sure analysis results do not
            change over time, set this value to a specific model version.
          required: false
          type: string
          default: latest
          minLength: 6
          maxLength: 18
          pattern: ^(latest|\d{4}-\d{2}-\d{2})(-preview)?$
          x-ms-client-name: modelVersion
        # Request body: the image itself, as a binary string.
        - name: imageData
          in: body
          description: The image to be analyzed
          required: true
          schema:
            type: string
            format: binary
      responses:
        '200':
          description: The request has succeeded.
          schema:
            $ref: '#/definitions/ImageAnalysisResult'
        # Any other status is described by the shared error envelope.
        default:
          description: An unexpected error response.
          schema:
            $ref: '#/definitions/Azure.Core.Foundations.ErrorResponse'
          headers:
            x-ms-error-code:
              type: string
              description: String error code indicating what went wrong.
      x-ms-examples:
        AnalyzeFromImageData:
          $ref: ./examples/AnalyzeFromImageData_MaximumSet.json
      summary: Microsoft Azure Post Imageanalysis:analyze
      tags:
        - Imageanalysis:analyze
x-ms-paths:
  # URL overload of the analyze operation: the request body is an ImageUrl
  # object, and no operation-level `consumes` override is declared.
  /imageanalysis:analyze?_overload=analyzeFromUrl:
    post:
      operationId: AnalyzeFromUrl
      description: Performs a single Image Analysis operation
      parameters:
        - $ref: '#/parameters/Azure.Core.Foundations.ApiVersionParameter'
        # Required, comma-separated list with at least one entry.
        - name: features
          in: query
          description: >-
            A list of visual features to analyze.

            Seven visual features are supported: Caption, DenseCaptions, Read
            (OCR), Tags, Objects, SmartCrops, and People.

            At least one visual feature must be specified.
          required: true
          type: array
          items:
            type: string
            enum:
              - tags
              - caption
              - denseCaptions
              - objects
              - read
              - smartCrops
              - people
            x-ms-enum:
              name: VisualFeatures
              modelAsString: true
              values:
                - name: tags
                  value: tags
                  description: >-
                    Extract content tags for thousands of recognizable objects,
                    living beings, scenery, and actions that appear in the
                    image.
                - name: caption
                  value: caption
                  description: >-
                    Generate a human-readable caption sentence that describes
                    the content of the image.
                - name: denseCaptions
                  value: denseCaptions
                  description: >-
                    Generate human-readable caption sentences for up to 10
                    different regions in the image, including one for the whole
                    image.
                - name: objects
                  value: objects
                  description: >-
                    Object detection. This is similar to tags, but focused on
                    detecting physical objects in the image and returning their
                    location.
                - name: read
                  value: read
                  description: >-
                    Extract printed or handwritten text from the image. Also
                    known as Optical Character Recognition (OCR).
                - name: smartCrops
                  value: smartCrops
                  description: >-
                    Find representative sub-regions of the image for thumbnail
                    generation, at desired aspect ratios, with priority given to
                    detected faces.
                - name: people
                  value: people
                  description: Detect people in the image and return their location.
          collectionFormat: csv
          minItems: 1
          x-ms-client-name: visualFeatures
        # Optional; defaults to 'en'.
        - name: language
          in: query
          description: >-
            The desired language for result generation (a two-letter language
            code).

            If this option is not specified, the default value 'en' is used
            (English).

            See https://aka.ms/cv-languages for a list of supported languages.
          required: false
          type: string
          default: en
          minLength: 2
        # Optional; defaults to false.
        - name: gender-neutral-caption
          in: query
          description: >-
            Boolean flag for enabling gender-neutral captioning for Caption and
            Dense Captions features.

            By default captions may contain gender terms (for example: 'man',
            'woman', or 'boy', 'girl').

            If you set this to "true", those will be replaced with
            gender-neutral terms (for example: 'person' or 'child').
          required: false
          type: boolean
          default: false
          x-ms-client-name: genderNeutralCaption
        # Optional, comma-separated list of floats.
        - name: smartcrops-aspect-ratios
          in: query
          description: >-
            A list of aspect ratios to use for smart cropping.

            Aspect ratios are calculated by dividing the target crop width in
            pixels by the height in pixels.

            Supported values are between 0.75 and 1.8 (inclusive).

            If this parameter is not specified, the service will return one crop
            region with an aspect ratio it sees fit between 0.5 and 2.0
            (inclusive).
          required: false
          type: array
          items:
            type: number
            format: float
          collectionFormat: csv
          x-ms-client-name: smartCropsAspectRatios
        # Optional; defaults to 'latest'. The pattern accepts 'latest' or a
        # YYYY-MM-DD date, each optionally followed by '-preview'.
        - name: model-version
          in: query
          description: >-
            The version of cloud AI-model used for analysis.

            The format is the following: 'latest' (default value) or
            'YYYY-MM-DD' or 'YYYY-MM-DD-preview', where 'YYYY', 'MM', 'DD' are
            the year, month and day associated with the model.

            This is not commonly set, as the default always gives the latest AI
            model with recent improvements.

            If however you would like to make sure analysis results do not
            change over time, set this value to a specific model version.
          required: false
          type: string
          default: latest
          minLength: 6
          maxLength: 18
          pattern: ^(latest|\d{4}-\d{2}-\d{2})(-preview)?$
          x-ms-client-name: modelVersion
        # Request body: an object carrying the URL of the image.
        - name: imageUrl
          in: body
          description: The image to be analyzed
          required: true
          schema:
            $ref: '#/definitions/ImageUrl'
      responses:
        '200':
          description: The request has succeeded.
          schema:
            $ref: '#/definitions/ImageAnalysisResult'
        # Any other status is described by the shared error envelope.
        default:
          description: An unexpected error response.
          schema:
            $ref: '#/definitions/Azure.Core.Foundations.ErrorResponse'
          headers:
            x-ms-error-code:
              type: string
              description: String error code indicating what went wrong.
      x-ms-examples:
        AnalyzeFromUrl:
          $ref: ./examples/AnalyzeFromUrl_MaximumSet.json
definitions:
  # Error payload; `code` and `message` are the required members.
  Azure.Core.Foundations.Error:
    description: The error object.
    type: object
    required: [code, message]
    properties:
      code:
        description: One of a server-defined set of error codes.
        type: string
      message:
        description: A human-readable representation of the error.
        type: string
      target:
        description: The target of the error.
        type: string
      # Recursive: every entry is itself an Error.
      details:
        description: >-
          An array of details about specific errors that led to this
          reported error.
        type: array
        x-ms-identifiers: []
        items:
          $ref: '#/definitions/Azure.Core.Foundations.Error'
      innererror:
        description: >-
          An object containing more specific information than the current
          object about the error.
        $ref: '#/definitions/Azure.Core.Foundations.InnerError'
  # Envelope wrapping a single required `error` object.
  Azure.Core.Foundations.ErrorResponse:
    description: A response containing error details.
    type: object
    required: [error]
    properties:
      error:
        description: The error object.
        $ref: '#/definitions/Azure.Core.Foundations.Error'
  # Nested error detail; `innererror` refers back to this same definition.
  Azure.Core.Foundations.InnerError:
    description: >-
      An object containing more specific information about the error. As
      per Microsoft One API guidelines -
      https://github.com/Microsoft/api-guidelines/blob/vNext/Guidelines.md#7102-error-condition-responses.
    type: object
    properties:
      code:
        description: One of a server-defined set of error codes.
        type: string
      innererror:
        description: Inner error.
        $ref: '#/definitions/Azure.Core.Foundations.InnerError'
  # Caption text for the whole image together with its confidence score.
  CaptionResult:
    type: object
    description: >-
      Represents a generated phrase that describes the content of the whole
      image.
    properties:
      # Bounded to the closed interval [0, 1].
      confidence:
        type: number
        format: float
        description: >-
          A score, in the range of 0 to 1 (inclusive), representing the
          confidence that this description is accurate.

          Higher values indicate higher confidence.
        minimum: 0
        maximum: 1
      text:
        type: string
        description: The text of the caption.
        minLength: 1
    required:
      - confidence
      - text
  # One crop rectangle paired with its aspect ratio.
  CropRegion:
    description: >-
      A region at the desired aspect ratio that can be used as image
      thumbnail.

      The region preserves as much content as possible from the analyzed
      image, with priority given to detected faces.
    type: object
    required: [aspectRatio, boundingBox]
    properties:
      aspectRatio:
        description: >-
          The aspect ratio of the crop region.

          Aspect ratio is calculated by dividing the width of the region
          in pixels by its height in pixels.

          The aspect ratio will be in the range 0.75 to 1.8 (inclusive)
          if provided by the developer during the analyze call.

          Otherwise, it will be in the range 0.5 to 2.0 (inclusive).
        type: number
        format: float
        minimum: 0
      boundingBox:
        description: The bounding box of the region.
        $ref: '#/definitions/ImageBoundingBox'
  # Caption for the whole image or for one region, with the region's box.
  DenseCaption:
    type: object
    description: >-
      Represents a generated phrase that describes the content of the whole
      image or a region in the image.
    properties:
      # Bounded to the closed interval [0, 1].
      confidence:
        type: number
        format: float
        description: >-
          A score, in the range of 0 to 1 (inclusive), representing the
          confidence that this description is accurate.

          Higher values indicate higher confidence.
        minimum: 0
        maximum: 1
      text:
        type: string
        description: The text of the caption.
        minLength: 1
      boundingBox:
        $ref: '#/definitions/ImageBoundingBox'
        description: The image region to which this caption applies.
    required:
      - confidence
      - text
      - boundingBox
  # Wrapper around the list of dense captions (at least one entry).
  DenseCaptionsResult:
    description: >-
      Represents a list of up to 10 image captions for different regions of
      the image.

      The first caption always applies to the whole image.
    type: object
    required: [values]
    properties:
      values:
        description: The list of image captions.
        type: array
        minItems: 1
        x-ms-identifiers: []
        items:
          $ref: '#/definitions/DenseCaption'
  # A detected object: where it is and the tag(s) describing it.
  DetectedObject:
    description: Represents a physical object detected in an image.
    type: object
    required: [boundingBox, tags]
    properties:
      boundingBox:
        description: A rectangular boundary where the object was detected.
        $ref: '#/definitions/ImageBoundingBox'
      # NOTE(review): described as a single-item list, yet minItems is 0 and
      # no maxItems is set - confirm the intended cardinality.
      tags:
        description: A single-item list containing the object information.
        type: array
        minItems: 0
        x-ms-identifiers: []
        items:
          $ref: '#/definitions/DetectedTag'
  # A detected person; both properties are marked readOnly.
  DetectedPerson:
    type: object
    description: Represents a person detected in an image.
    properties:
      boundingBox:
        $ref: '#/definitions/ImageBoundingBox'
        description: A rectangular boundary where the person was detected.
        readOnly: true
      # Bounded to the closed interval [0, 1].
      confidence:
        type: number
        format: float
        description: >-
          A score, in the range of 0 to 1 (inclusive), representing the
          confidence that this detection was accurate.

          Higher values indicate higher confidence.
        minimum: 0
        maximum: 1
        readOnly: true
    required:
      - boundingBox
      - confidence
  # A named tag with its confidence score.
  DetectedTag:
    type: object
    description: >-
      A content entity observation in the image. A tag can be a physical
      object, living being, scenery, or action that appears in the image.
    properties:
      # Bounded to the closed interval [0, 1].
      confidence:
        type: number
        format: float
        description: >-
          A score, in the range of 0 to 1 (inclusive), representing the
          confidence that this entity was observed.

          Higher values indicate higher confidence.
        minimum: 0
        maximum: 1
      name:
        type: string
        description: Name of the entity.
        minLength: 1
    required:
      - confidence
      - name
  # A text block holding one or more text lines.
  DetectedTextBlock:
    description: Represents a single block of detected text in the image.
    type: object
    required: [lines]
    properties:
      lines:
        description: A list of text lines in this block.
        type: array
        minItems: 1
        x-ms-identifiers: []
        items:
          $ref: '#/definitions/DetectedTextLine'
  # A text line: its content, its outline and its constituent words.
  DetectedTextLine:
    description: Represents a single line of text in the image.
    type: object
    required: [text, boundingPolygon, words]
    properties:
      text:
        description: Text content of the detected text line.
        type: string
        minLength: 1
      # Exactly four points (minItems and maxItems are both 4).
      boundingPolygon:
        description: >-
          A bounding polygon around the text line. At the moment only
          quadrilaterals are supported (represented by 4 image
          points).
        type: array
        minItems: 4
        maxItems: 4
        x-ms-identifiers: []
        items:
          $ref: '#/definitions/ImagePoint'
      words:
        description: A list of words in this line.
        type: array
        minItems: 1
        x-ms-identifiers: []
        items:
          $ref: '#/definitions/DetectedTextWord'
  # A single recognized word with its outline and confidence.
  DetectedTextWord:
    type: object
    # Folded block scalar replaces the former double-quoted string, which
    # embedded a carriage return / line feed in the middle of a sentence.
    description: >-
      A word object consisting of a contiguous sequence of characters. For
      non-space delimited languages, such as Chinese, Japanese, and Korean,
      each character is represented as its own word.
    properties:
      text:
        type: string
        description: Text content of the word.
        minLength: 1
      # Exactly four points (minItems and maxItems are both 4).
      boundingPolygon:
        type: array
        description: >-
          A bounding polygon around the word. At the moment only quadrilaterals
          are supported (represented by 4 image points).
        minItems: 4
        maxItems: 4
        items:
          $ref: '#/definitions/ImagePoint'
        x-ms-identifiers: []
      confidence:
        type: number
        format: float
        description: >-
          The level of confidence that the word was detected. Confidence scores
          span the range of 0.0 to 1.0 (inclusive), with higher values
          indicating a higher confidence of detection.
        minimum: 0
        maximum: 1
    required:
      - text
      - boundingPolygon
      - confidence
  # Response body of the analyze operations. Only `metadata` and
  # `modelVersion` are required; every *Result property is optional.
  ImageAnalysisResult:
    type: object
    description: Represents the outcome of an Image Analysis operation.
    properties:
      captionResult:
        $ref: '#/definitions/CaptionResult'
        description: The generated phrase that describes the content of the analyzed image.
        x-ms-client-name: caption
      denseCaptionsResult:
        $ref: '#/definitions/DenseCaptionsResult'
        description: >-
          The up to 10 generated phrases, the first describing the content of
          the whole image, and the others describing the content of different
          regions of the image.
        x-ms-client-name: denseCaptions
      metadata:
        $ref: '#/definitions/ImageMetadata'
        description: Metadata associated with the analyzed image.
      modelVersion:
        type: string
        description: The cloud AI model used for the analysis.
      objectsResult:
        $ref: '#/definitions/ObjectsResult'
        description: >-
          A list of detected physical objects in the analyzed image, and their
          location.
        x-ms-client-name: objects
      peopleResult:
        $ref: '#/definitions/PeopleResult'
        description: A list of detected people in the analyzed image, and their location.
        x-ms-client-name: people
      readResult:
        $ref: '#/definitions/ReadResult'
        description: >-
          The extracted printed and hand-written text in the analyzed image.
          Also known as OCR.
        x-ms-client-name: read
      smartCropsResult:
        $ref: '#/definitions/SmartCropsResult'
        description: >-
          A list of crop regions at the desired aspect ratios (if provided)
          that can be used as image thumbnails.

          These regions preserve as much content as possible from the analyzed
          image, with priority given to detected faces.
        x-ms-client-name: smartCrops
      tagsResult:
        $ref: '#/definitions/TagsResult'
        description: A list of content tags in the analyzed image.
        x-ms-client-name: tags
    required:
      - metadata
      - modelVersion
  # Rectangle given by its top-left corner (x, y) plus width and height.
  ImageBoundingBox:
    description: A basic rectangle specifying a sub-region of the image.
    type: object
    required: [x, 'y', w, h]
    properties:
      x:
        description: X-coordinate of the top left point of the area, in pixels.
        type: integer
        format: int32
        minimum: 0
      # Quoted so that YAML 1.1 loaders do not resolve the key to a boolean.
      'y':
        description: Y-coordinate of the top left point of the area, in pixels.
        type: integer
        format: int32
        minimum: 0
      w:
        description: Width of the area, in pixels.
        x-ms-client-name: width
        type: integer
        format: int32
        minimum: 0
      h:
        description: Height of the area, in pixels.
        x-ms-client-name: height
        type: integer
        format: int32
        minimum: 0
  # Pixel dimensions of the analyzed image; each is at least 1.
  ImageMetadata:
    description: Metadata associated with the analyzed image.
    type: object
    required: [height, width]
    properties:
      height:
        description: The height of the image in pixels.
        type: integer
        format: int32
        minimum: 1
      width:
        description: The width of the image in pixels.
        type: integer
        format: int32
        minimum: 1
  # A pixel position; both coordinates are non-negative integers.
  ImagePoint:
    type: object
    description: Represents the coordinates of a single pixel in the image.
    properties:
      x:
        type: integer
        format: int32
        description: >-
          The horizontal x-coordinate of this point, in pixels. A value of
          zero corresponds to the left-most pixels in the image.
        minimum: 0
      # Quoted so that YAML 1.1 loaders do not resolve the key to a boolean.
      'y':
        type: integer
        format: int32
        description: >-
          The vertical y-coordinate of this point, in pixels. A value of zero
          corresponds to the top-most pixels in the image.
        minimum: 0
    required:
      - x
      - 'y'
  # Object with a single required `url` property.
  ImageUrl:
    description: An object holding the publicly reachable URL of an image to analyze.
    type: object
    required: [url]
    properties:
      url:
        description: Publicly reachable URL of an image to analyze.
        type: string
        format: uri
  # Wrapper around the list of detected objects; the list may be empty.
  ObjectsResult:
    type: object
    description: >-
      Represents a list of physical objects detected in an image and their
      location.
    properties:
      values:
        type: array
        description: A list of physical objects detected in an image and their location.
        minItems: 0
        items:
          $ref: '#/definitions/DetectedObject'
        x-ms-identifiers: []
    required:
      - values
  # Wrapper around the list of detected people; the list may be empty.
  PeopleResult:
    description: Represents a list of people detected in an image and their location.
    type: object
    required: [values]
    properties:
      values:
        description: A list of people detected in an image and their location.
        type: array
        minItems: 0
        x-ms-identifiers: []
        items:
          $ref: '#/definitions/DetectedPerson'
  # OCR output; `blocks` holds exactly one entry (minItems = maxItems = 1).
  ReadResult:
    description: The results of a Read (OCR) operation.
    type: object
    required: [blocks]
    properties:
      blocks:
        description: >-
          A list of text blocks in the image. At the moment only one block
          is returned, containing all the text detected in the image.
        type: array
        minItems: 1
        maxItems: 1
        x-ms-identifiers: []
        items:
          $ref: '#/definitions/DetectedTextBlock'
  # Wrapper around the list of crop regions (at least one entry).
  SmartCropsResult:
    type: object
    description: >-
      Smart cropping result. A list of crop regions at the desired aspect
      ratios (if provided) that can be used as image thumbnails.

      These regions preserve as much content as possible from the analyzed
      image, with priority given to detected faces.
    properties:
      values:
        type: array
        description: A list of crop regions.
        minItems: 1
        items:
          $ref: '#/definitions/CropRegion'
        x-ms-identifiers: []
    required:
      - values
  # Wrapper around the list of content tags; the list may be empty.
  TagsResult:
    type: object
    description: >-
      A list of entities observed in the image. Tags can be physical objects,
      living beings, scenery, or actions that appear in the image.
    properties:
      values:
        type: array
        description: A list of tags.
        minItems: 0
        items:
          $ref: '#/definitions/DetectedTag'
        x-ms-identifiers: []
    required:
      - values
# Reusable parameters, referenced from the operations via $ref.
parameters:
  # Required `api-version` query-string parameter.
  Azure.Core.Foundations.ApiVersionParameter:
    name: api-version
    in: query
    required: true
    type: string
    minLength: 1
    description: The API version to use for this operation.
    x-ms-client-name: apiVersion
    x-ms-parameter-location: method