Cloud Dataproc API

The Cloud Dataproc API manages Hadoop-based clusters and jobs on Google Cloud. It provides programmatic access to create, configure, and delete clusters, submit and monitor Apache Spark, Hadoop, Hive, and Pig jobs, and manage workflow templates for orchestrating multi-step data processing pipelines. The API supports autoscaling policies, optional components, and integration with other Google Cloud services.

OpenAPI Specification

cloud-dataproc-openapi.yml
# Document header: OpenAPI specification version followed by API metadata.
openapi: '3.1.0'
info:
  title: 'Google Cloud Dataproc API'
  version: 'v1'
  description: >-
    The Cloud Dataproc API manages Hadoop-based clusters and jobs on
    Google Cloud Platform. It provides programmatic access to create and
    manage clusters, submit and monitor Apache Spark, Apache Hadoop,
    Apache Hive, and Apache Pig jobs, and manage workflow templates for
    orchestrating multi-step data processing pipelines.
  # Where to find the reference documentation for this API.
  contact:
    name: 'Google Cloud'
    url: 'https://cloud.google.com/dataproc/docs/reference/rest'
  # License declared for this specification document.
  license:
    name: 'Apache 2.0'
    url: 'https://www.apache.org/licenses/LICENSE-2.0'
# Base URL of the API; only the v1 endpoint is declared.
servers:
  - description: 'Cloud Dataproc API v1'
    url: 'https://dataproc.googleapis.com/v1'
# Tag catalogue used to group operations, one tag per resource family.
tags:
  - name: 'Clusters'
    description: 'Operations on Dataproc clusters'
  - name: 'Jobs'
    description: 'Operations on Dataproc jobs'
  - name: 'WorkflowTemplates'
    description: 'Operations on workflow templates'
paths:
  # Cluster collection: list existing clusters (GET) or create one (POST).
  '/projects/{project}/regions/{region}/clusters':
    get:
      operationId: listClusters
      summary: 'Google Cloud Dataproc List clusters'
      description: 'Lists all regions/{region}/clusters in a project.'
      tags: [Clusters]
      security:
        - oauth2: ['https://www.googleapis.com/auth/cloud-platform']
      parameters:
        # Path parameters identifying the project and region.
        - name: project
          in: path
          required: true
          schema: {type: string}
        - name: region
          in: path
          required: true
          schema: {type: string}
        # Optional filter and pagination controls.
        - name: filter
          in: query
          schema: {type: string}
        - name: pageSize
          in: query
          schema: {type: integer}
        - name: pageToken
          in: query
          schema: {type: string}
      responses:
        "200":
          description: 'Successful response'
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ListClustersResponse"
    post:
      operationId: createCluster
      summary: 'Google Cloud Dataproc Create a cluster'
      description: >-
        Creates a cluster in a project. The returned Operation.metadata
        will be ClusterOperationMetadata.
      tags: [Clusters]
      security:
        - oauth2: ['https://www.googleapis.com/auth/cloud-platform']
      parameters:
        - name: project
          in: path
          required: true
          schema: {type: string}
        - name: region
          in: path
          required: true
          schema: {type: string}
      # The cluster to create is sent as the JSON request body.
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/Cluster"
      # The response is an Operation rather than the Cluster itself.
      responses:
        "200":
          description: 'Cluster creation initiated'
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Operation"
  # Single cluster resource: read (GET), update (PATCH) or delete (DELETE).
  '/projects/{project}/regions/{region}/clusters/{cluster}':
    get:
      operationId: getCluster
      summary: 'Google Cloud Dataproc Get a cluster'
      description: 'Gets the resource representation for a cluster in a project.'
      tags: [Clusters]
      security:
        - oauth2: ['https://www.googleapis.com/auth/cloud-platform']
      parameters:
        - name: project
          in: path
          required: true
          schema: {type: string}
        - name: region
          in: path
          required: true
          schema: {type: string}
        - name: cluster
          in: path
          required: true
          schema: {type: string}
      responses:
        "200":
          description: 'Successful response'
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Cluster"
    patch:
      operationId: updateCluster
      summary: 'Google Cloud Dataproc Update a cluster'
      description: >-
        Updates a cluster in a project. The returned Operation.metadata
        will be ClusterOperationMetadata.
      tags: [Clusters]
      security:
        - oauth2: ['https://www.googleapis.com/auth/cloud-platform']
      parameters:
        - name: project
          in: path
          required: true
          schema: {type: string}
        - name: region
          in: path
          required: true
          schema: {type: string}
        - name: cluster
          in: path
          required: true
          schema: {type: string}
        # updateMask is the only query parameter here and it is mandatory.
        - name: updateMask
          in: query
          required: true
          schema: {type: string}
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/Cluster"
      responses:
        "200":
          description: 'Cluster update initiated'
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Operation"
    delete:
      operationId: deleteCluster
      summary: 'Google Cloud Dataproc Delete a cluster'
      description: >-
        Deletes a cluster in a project. The returned Operation.metadata
        will be ClusterOperationMetadata.
      tags: [Clusters]
      security:
        - oauth2: ['https://www.googleapis.com/auth/cloud-platform']
      parameters:
        - name: project
          in: path
          required: true
          schema: {type: string}
        - name: region
          in: path
          required: true
          schema: {type: string}
        - name: cluster
          in: path
          required: true
          schema: {type: string}
      responses:
        "200":
          description: 'Cluster deletion initiated'
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Operation"
  # Job collection for a project/region; this path item defines listing only.
  /projects/{project}/regions/{region}/jobs:
    get:
      tags:
        - Jobs
      summary: Google Cloud Dataproc List jobs
      description: Lists regions/{region}/jobs in a project.
      operationId: listJobs
      parameters:
        # Path parameters identifying the project and region.
        - name: project
          in: path
          required: true
          schema:
            type: string
        - name: region
          in: path
          required: true
          schema:
            type: string
        # Pagination controls (both optional).
        - name: pageSize
          in: query
          schema:
            type: integer
        - name: pageToken
          in: query
          schema:
            type: string
        # Optional narrowing of the result set by cluster name or job state.
        - name: clusterName
          in: query
          schema:
            type: string
        - name: jobStateMatcher
          in: query
          schema:
            type: string
            enum:
              - ALL
              - ACTIVE
              - NON_ACTIVE
        # Optional filter expression, mirroring the `filter` query parameter
        # that listClusters exposes. It is not required, so existing callers
        # are unaffected.
        - name: filter
          in: query
          description: Optional filter expression constraining the jobs to list.
          schema:
            type: string
      responses:
        '200':
          description: Successful response
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ListJobsResponse'
      security:
        - oauth2:
            - https://www.googleapis.com/auth/cloud-platform
  # Custom ":submit" verb on the job collection. The key is quoted so the
  # embedded colon cannot be mistaken for a mapping indicator.
  '/projects/{project}/regions/{region}/jobs:submit':
    post:
      operationId: submitJob
      summary: 'Google Cloud Dataproc Submit a job'
      description: 'Submits a job to a cluster.'
      tags: [Jobs]
      security:
        - oauth2: ['https://www.googleapis.com/auth/cloud-platform']
      parameters:
        - name: project
          in: path
          required: true
          schema: {type: string}
        - name: region
          in: path
          required: true
          schema: {type: string}
      # The body wraps the job in a SubmitJobRequest envelope.
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/SubmitJobRequest"
      responses:
        "200":
          description: 'Job submitted successfully'
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Job"
  # Single job resource: fetch it (GET) or delete it (DELETE).
  /projects/{project}/regions/{region}/jobs/{jobId}:
    get:
      tags:
        - Jobs
      summary: Google Cloud Dataproc Get a job
      description: Gets the resource representation for a job in a project.
      operationId: getJob
      parameters:
        # Path parameters identifying the project, region and job.
        - name: project
          in: path
          required: true
          schema:
            type: string
        - name: region
          in: path
          required: true
          schema:
            type: string
        - name: jobId
          in: path
          required: true
          schema:
            type: string
      responses:
        '200':
          description: Successful response
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Job'
      security:
        - oauth2:
            - https://www.googleapis.com/auth/cloud-platform
    delete:
      tags:
        - Jobs
      summary: Google Cloud Dataproc Delete a job
      description: Deletes the job from the project. After deletion, the job metadata is not retrievable.
      operationId: deleteJob
      parameters:
        - name: project
          in: path
          required: true
          schema:
            type: string
        - name: region
          in: path
          required: true
          schema:
            type: string
        - name: jobId
          in: path
          required: true
          schema:
            type: string
      responses:
        '200':
          description: Job deleted successfully
          # Declare the JSON body explicitly, as every other operation in this
          # spec does, so generated clients do not treat the response as
          # body-less.
          # NOTE(review): presumably the service answers with an empty JSON
          # object ({}) - confirm against the jobs.delete reference.
          content:
            application/json:
              schema:
                type: object
      security:
        - oauth2:
            - https://www.googleapis.com/auth/cloud-platform
  # Custom ":cancel" verb on a single job; no request body is declared.
  '/projects/{project}/regions/{region}/jobs/{jobId}:cancel':
    post:
      operationId: cancelJob
      summary: 'Google Cloud Dataproc Cancel a job'
      description: 'Starts a job cancellation request.'
      tags: [Jobs]
      security:
        - oauth2: ['https://www.googleapis.com/auth/cloud-platform']
      parameters:
        - name: project
          in: path
          required: true
          schema: {type: string}
        - name: region
          in: path
          required: true
          schema: {type: string}
        - name: jobId
          in: path
          required: true
          schema: {type: string}
      # The response carries the Job resource.
      responses:
        "200":
          description: 'Job cancellation initiated'
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Job"
  # Workflow template collection: list templates (GET) or create one (POST).
  '/projects/{project}/regions/{region}/workflowTemplates':
    get:
      operationId: listWorkflowTemplates
      summary: 'Google Cloud Dataproc List workflow templates'
      description: 'Lists workflows that match the specified filter in the request.'
      tags: [WorkflowTemplates]
      security:
        - oauth2: ['https://www.googleapis.com/auth/cloud-platform']
      parameters:
        - name: project
          in: path
          required: true
          schema: {type: string}
        - name: region
          in: path
          required: true
          schema: {type: string}
        # Optional pagination controls.
        - name: pageSize
          in: query
          schema: {type: integer}
        - name: pageToken
          in: query
          schema: {type: string}
      responses:
        "200":
          description: 'Successful response'
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ListWorkflowTemplatesResponse"
    post:
      operationId: createWorkflowTemplate
      summary: 'Google Cloud Dataproc Create a workflow template'
      description: 'Creates new workflow template.'
      tags: [WorkflowTemplates]
      security:
        - oauth2: ['https://www.googleapis.com/auth/cloud-platform']
      parameters:
        - name: project
          in: path
          required: true
          schema: {type: string}
        - name: region
          in: path
          required: true
          schema: {type: string}
      # Request and response both use the WorkflowTemplate schema.
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/WorkflowTemplate"
      responses:
        "200":
          description: 'Workflow template created'
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/WorkflowTemplate"
components:
  # One OAuth 2.0 scheme (authorization-code flow) with a single scope,
  # referenced by name ("oauth2") from the operations' security entries.
  securitySchemes:
    oauth2:
      type: oauth2
      flows:
        authorizationCode:
          authorizationUrl: 'https://accounts.google.com/o/oauth2/auth'
          tokenUrl: 'https://oauth2.googleapis.com/token'
          scopes:
            'https://www.googleapis.com/auth/cloud-platform': 'Full access to Cloud Platform'
  schemas:
    # Cluster resource: identifiers, configuration, labels and status fields.
    Cluster:
      type: object
      properties:
        projectId: {type: string}
        clusterName:
          description: 'The cluster name, unique within a project.'
          type: string
        config:
          $ref: "#/components/schemas/ClusterConfig"
        # Free-form string-to-string labels.
        labels:
          type: object
          additionalProperties: {type: string}
        # Current status plus a list of status entries of the same shape.
        status:
          $ref: "#/components/schemas/ClusterStatus"
        statusHistory:
          type: array
          items:
            $ref: "#/components/schemas/ClusterStatus"
        clusterUuid: {type: string}
    # Cluster configuration: bucket names, gceClusterConfig settings,
    # per-role instance groups, software settings and initialization actions.
    ClusterConfig:
      type: object
      properties:
        configBucket: {type: string}
        tempBucket: {type: string}
        gceClusterConfig:
          type: object
          properties:
            zoneUri: {type: string}
            networkUri: {type: string}
            subnetworkUri: {type: string}
            internalIpOnly: {type: boolean}
            serviceAccountScopes:
              type: array
              items: {type: string}
            tags:
              type: array
              items: {type: string}
            metadata:
              type: object
              additionalProperties: {type: string}
        # The three instance groups share one schema.
        masterConfig:
          $ref: "#/components/schemas/InstanceGroupConfig"
        workerConfig:
          $ref: "#/components/schemas/InstanceGroupConfig"
        secondaryWorkerConfig:
          $ref: "#/components/schemas/InstanceGroupConfig"
        softwareConfig:
          type: object
          properties:
            imageVersion: {type: string}
            # "properties" here is a field of softwareConfig (a string map),
            # not the JSON Schema keyword.
            properties:
              type: object
              additionalProperties: {type: string}
            optionalComponents:
              type: array
              items: {type: string}
        initializationActions:
          type: array
          items:
            type: object
            properties:
              executableFile: {type: string}
              executionTimeout: {type: string}
    # Configuration of one instance group: size, machine type, disks, image
    # and preemptibility mode.
    InstanceGroupConfig:
      type: object
      properties:
        numInstances: {type: integer}
        machineTypeUri: {type: string}
        diskConfig:
          type: object
          properties:
            bootDiskType: {type: string}
            bootDiskSizeGb: {type: integer}
            numLocalSsds: {type: integer}
        imageUri: {type: string}
        preemptibility:
          type: string
          enum: [PREEMPTIBILITY_UNSPECIFIED, NON_PREEMPTIBLE, PREEMPTIBLE, SPOT]
    # Status record of a cluster: state, when that state began, and a
    # free-text detail string.
    ClusterStatus:
      type: object
      properties:
        state:
          type: string
          # Allowed state names.
          enum:
            - 'UNKNOWN'
            - 'CREATING'
            - 'RUNNING'
            - 'ERROR'
            - 'ERROR_DUE_TO_UPDATE'
            - 'DELETING'
            - 'UPDATING'
            - 'STOPPING'
            - 'STOPPED'
            - 'STARTING'
            - 'REPAIRING'
        stateStartTime: {type: string, format: date-time}
        detail: {type: string}
    # Job resource: identity, cluster placement, one job-type payload per
    # supported engine, status, labels and driver output locations.
    Job:
      type: object
      properties:
        # Identifies the job by project and job id.
        reference:
          type: object
          properties:
            projectId:
              type: string
            jobId:
              type: string
        # Cluster the job is placed on, by name and UUID.
        placement:
          type: object
          properties:
            clusterName:
              type: string
            clusterUuid:
              type: string
        # --- Job-type payloads -------------------------------------------
        # Inside each payload, the nested key named "properties" that sits
        # next to fields such as "args" is a field of the job (a
        # string-to-string map), not the JSON Schema keyword.
        hadoopJob:
          type: object
          properties:
            mainJarFileUri:
              type: string
            mainClass:
              type: string
            args:
              type: array
              items:
                type: string
            jarFileUris:
              type: array
              items:
                type: string
            properties:
              type: object
              additionalProperties:
                type: string
        sparkJob:
          type: object
          properties:
            mainJarFileUri:
              type: string
            mainClass:
              type: string
            args:
              type: array
              items:
                type: string
            jarFileUris:
              type: array
              items:
                type: string
            properties:
              type: object
              additionalProperties:
                type: string
        pysparkJob:
          type: object
          properties:
            mainPythonFileUri:
              type: string
            args:
              type: array
              items:
                type: string
            pythonFileUris:
              type: array
              items:
                type: string
            properties:
              type: object
              additionalProperties:
                type: string
        hiveJob:
          type: object
          properties:
            queryFileUri:
              type: string
            queryList:
              type: object
              properties:
                queries:
                  type: array
                  items:
                    type: string
            continueOnFailure:
              type: boolean
            scriptVariables:
              type: object
              additionalProperties:
                type: string
            properties:
              type: object
              additionalProperties:
                type: string
        pigJob:
          type: object
          properties:
            queryFileUri:
              type: string
            queryList:
              type: object
              properties:
                queries:
                  type: array
                  items:
                    type: string
            continueOnFailure:
              type: boolean
            scriptVariables:
              type: object
              additionalProperties:
                type: string
            # Added for parity with the other job types, which all declare a
            # string-to-string "properties" map. The field is optional, so
            # existing payloads remain valid.
            properties:
              type: object
              additionalProperties:
                type: string
        # Job state with the time that state began and free-text details.
        status:
          type: object
          properties:
            state:
              type: string
              enum:
                - STATE_UNSPECIFIED
                - PENDING
                - SETUP_DONE
                - RUNNING
                - CANCEL_PENDING
                - CANCEL_STARTED
                - CANCELLED
                - DONE
                - ERROR
                - ATTEMPT_FAILURE
            stateStartTime:
              type: string
              format: date-time
            details:
              type: string
        # Free-form string-to-string labels.
        labels:
          type: object
          additionalProperties:
            type: string
        driverOutputResourceUri:
          type: string
        driverControlFilesUri:
          type: string
        jobUuid:
          type: string
    # Request envelope for job submission; only "job" is mandatory.
    SubmitJobRequest:
      type: object
      required: [job]
      properties:
        job:
          $ref: "#/components/schemas/Job"
        requestId: {type: string}
    # Workflow template: identifiers and timestamps, labels, cluster
    # placement (managed cluster or label-based selector) and job steps.
    WorkflowTemplate:
      type: object
      properties:
        id: {type: string}
        name: {type: string}
        version: {type: integer}
        createTime: {type: string, format: date-time}
        updateTime: {type: string, format: date-time}
        labels:
          type: object
          additionalProperties: {type: string}
        placement:
          type: object
          properties:
            managedCluster:
              type: object
              properties:
                clusterName: {type: string}
                config:
                  $ref: "#/components/schemas/ClusterConfig"
            clusterSelector:
              type: object
              properties:
                zone: {type: string}
                clusterLabels:
                  type: object
                  additionalProperties: {type: string}
        jobs:
          type: array
          items:
            type: object
            properties:
              stepId: {type: string}
              # Job payloads are declared as untyped objects in this schema.
              hadoopJob: {type: object}
              sparkJob: {type: object}
              pysparkJob: {type: object}
              hiveJob: {type: object}
              pigJob: {type: object}
              # Step ids listed as prerequisites of this step.
              prerequisiteStepIds:
                type: array
                items: {type: string}
    # One page of clusters plus the token for the next page.
    ListClustersResponse:
      type: object
      properties:
        nextPageToken: {type: string}
        clusters:
          type: array
          items:
            $ref: "#/components/schemas/Cluster"
    # One page of jobs plus the token for the next page.
    ListJobsResponse:
      type: object
      properties:
        nextPageToken: {type: string}
        jobs:
          type: array
          items:
            $ref: "#/components/schemas/Job"
    # One page of workflow templates plus the token for the next page.
    ListWorkflowTemplatesResponse:
      type: object
      properties:
        nextPageToken: {type: string}
        templates:
          type: array
          items:
            $ref: "#/components/schemas/WorkflowTemplate"
    # Operation handle returned by the cluster create, update and delete
    # operations in this spec.
    Operation:
      type: object
      properties:
        name:
          type: string
        # Boolean completion flag.
        done:
          type: boolean
        # Left as an untyped object; the cluster operation descriptions name
        # ClusterOperationMetadata as its content.
        metadata:
          type: object
        error:
          type: object
          properties:
            code:
              type: integer
            message:
              type: string
            # Optional structured detail objects accompanying code/message.
            # Declared so clients do not drop them; entries are left untyped.
            # NOTE(review): shape follows google.rpc.Status - confirm.
            details:
              type: array
              items:
                type: object
        # Left as an untyped object.
        response:
          type: object