Semantic Scholar Datasets API

Provides downloadable bulk snapshots of the full Semantic Scholar academic graph, updated monthly. Supports incremental diff downloads between releases. Requires an API key for authentication.

OpenAPI Specification

semantic-scholar-datasets-openapi.yml Raw ↑
openapi: 3.0.0
info:
  title: S2AG Datasets
  version: '1.0'
  # Markdown overview of the API followed by a Python usage sample.
  # Written as a literal block scalar so the sample reads as it renders:
  # line breaks are kept verbatim and the final newline is stripped (`|-`).
  # The sample lines are indented four spaces past the first line, which is
  # what makes Markdown treat them as a code block.
  # The nested quotes in the sample JSON output are backslash-escaped so the
  # printed objects are valid JSON.
  description: |-
    Download full-corpus datasets from the Semantic Scholar Academic Graph (S2AG)
        <br><br>
        Some python demonstrating usage of the datasets API:

        r1 = requests.get('https://api.semanticscholar.org/datasets/v1/release').json()
        print(r1[-3:])
        ['2023-03-14', '2023-03-21', '2023-03-28']

        r2 = requests.get('https://api.semanticscholar.org/datasets/v1/release/latest').json()
        print(r2['release_id'])
        2023-03-28

        print(json.dumps(r2['datasets'][0], indent=2))
        {
            "name": "abstracts",
            "description": "Paper abstract text, where available. 100M records in 30 1.8GB files.",
            "README": "Semantic Scholar Academic Graph Datasets The \"abstracts\" dataset provides..."
        }

        r3 = requests.get('https://api.semanticscholar.org/datasets/v1/release/latest/dataset/abstracts').json()
        print(json.dumps(r3, indent=2))
        {
          "name": "abstracts",
          "description": "Paper abstract text, where available. 100M records in 30 1.8GB files.",
          "README": "Semantic Scholar Academic Graph Datasets The \"abstracts\" dataset provides...",
          "files": [
            "https://ai2-s2ag.s3.amazonaws.com/dev/staging/2023-03-28/abstracts/20230331_0..."
          ]
        }
# Base URL that every entry under `paths` is appended to.
servers:
- url: https://api.semanticscholar.org/datasets/v1
paths:
  # Incremental update endpoint: lists the diff files needed to move one
  # dataset from a starting release to an ending release.
  /diffs/{start_release_id}/to/{end_release_id}/{dataset_name}:
    get:
      summary: Download Links for Incremental Diffs
      operationId: get_diff
      tags:
      - Incremental Updates
      # Markdown; literal block scalar so the embedded code samples keep their
      # indentation and line breaks exactly as written (`|-` strips the final
      # newline).
      description: |-
        Full datasets can be updated from one release to another to avoid
        downloading and processing data that hasn't changed. This method returns
        a list of all the "diffs" that are required to catch a given dataset up
        from its current release to a newer one.

        Each "diff" represents changes between two sequential releases, and
        contains two lists of files, an "updated" list and a "deleted" list.
        Records in the "updated" list need to be inserted or replaced by their
        primary key. Records in the "deleted" list should be removed.

        Example code for updating a database or key/value store:

            difflist = requests.get('https://api.semanticscholar.org/datasets/v1/diffs/2023-08-01/to/latest/papers').json()
            for diff in difflist['diffs']:
                for url in diff['update_files']:
                    for json_line in requests.get(url).iter_lines():
                        record = json.loads(json_line)
                        datastore.upsert(record['corpusid'], record)
                for url in diff['delete_files']:
                    for json_line in requests.get(url).iter_lines():
                        record = json.loads(json_line)
                        datastore.delete(record['corpusid'])

        Example code for updating via a join in Spark:

            current = sc.textFile('s3://curr-dataset-location').map(json.loads).keyBy(lambda x: x['corpusid'])
            updates = sc.textFile('s3://diff-updates-location').map(json.loads).keyBy(lambda x: x['corpusid'])
            deletes = sc.textFile('s3://diff-deletes-location').map(json.loads).keyBy(lambda x: x['corpusid'])

            updated = current.fullOuterJoin(updates).mapValues(lambda x: x[1] if x[1] is not None else x[0])
            updated = updated.fullOuterJoin(deletes).mapValues(lambda x: None if x[1] is not None else x[0]).filter(lambda x: x[1] is not None)
            updated.values().map(json.dumps).saveAsTextFile('s3://updated-dataset-location')
      # Every template variable in the path must be declared as a required
      # path parameter; names match the `{...}` placeholders exactly.
      parameters:
      - name: start_release_id
        in: path
        required: true
        description: ID of the release currently held by the client.
        schema:
          type: string
        example: '2023-08-01'
      - name: end_release_id
        in: path
        required: true
        # The usage sample in the description above passes `latest` here.
        description: ID of the release the client wants to update to.
        schema:
          type: string
        example: '2023-08-29'
      - name: dataset_name
        in: path
        required: true
        description: Name of the dataset to fetch diffs for.
        schema:
          type: string
        example: papers
      responses:
        '200':
          description: List of download links for one dataset between given releases
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Dataset%20Diff%20List'
  # Release index: returns the IDs of every available release.
  /release/:
    get:
      operationId: get_releases
      summary: List of Available Releases
      # Folded scalar: the two source lines join into one line of text.
      description: >-
        Releases are identified by a date stamp such as "2023-08-01". Each
        release contains full data for each dataset.
      tags:
      - Release Data
      responses:
        '200':
          description: List of Available Releases
          content:
            application/json:
              # A flat JSON array of release ID strings.
              schema:
                type: array
                items:
                  description: Release ids in the form of date stamps
                  type: string
                  example: '2022-01-17'
  # Release detail: metadata for one release plus a summary of its datasets.
  /release/{release_id}:
    get:
      summary: List of Datasets in a Release
      operationId: get_release
      tags:
      - Release Data
      description: >-
        Metadata describing a particular release, including a list of datasets
        available.
      # The `{release_id}` template variable must be declared as a required
      # path parameter.
      parameters:
      - name: release_id
        in: path
        required: true
        description: ID of the release to describe.
        schema:
          type: string
        example: '2022-01-17'
      responses:
        '200':
          description: Contents of the release with the given ID
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Release%20Metadata'
  # Dataset detail: description and download links for one dataset within
  # one release.
  /release/{release_id}/dataset/{dataset_name}:
    get:
      summary: Download Links for a Dataset
      operationId: get_dataset
      tags:
      - Release Data
      # Literal block scalar; the line break between the two lines is part of
      # the text.
      description: |-
        Datasets are partitioned and stored on S3. Clients can retrieve them by requesting this list
        of pre-signed download urls and fetching all the partitions.
      # Both template variables in the path must be declared as required path
      # parameters; names match the `{...}` placeholders exactly.
      parameters:
      - name: release_id
        in: path
        required: true
        description: ID of the release that contains the dataset.
        schema:
          type: string
        example: '2022-01-17'
      - name: dataset_name
        in: path
        required: true
        description: Name of the dataset to fetch download links for.
        schema:
          type: string
        example: papers
      responses:
        '200':
          description: >-
            Description and download links for the given dataset within
            the given release
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Dataset%20Metadata'
components:
  # NOTE(review): OpenAPI 3.0 restricts component keys to letters, digits,
  # '.', '-' and '_'. The schema names below contain spaces; they are kept
  # as-is because the paths section references them by these exact names.
  schemas:
    # Response body of GET /release/{release_id}.
    Release Metadata:
      type: object
      properties:
        release_id:
          type: string
          example: '2022-01-17'
        README:
          type: string
          description: License and usage
          example: Subject to the following terms ...
        datasets:
          type: array
          description: Dataset metadata
          items:
            # A $ref is a URI reference, so the space in the schema name is
            # percent-encoded, matching the other references in this file.
            $ref: '#/components/schemas/Dataset%20Summary'
    # One entry of the `datasets` array in Release Metadata: name,
    # description and README of a dataset, without download links.
    Dataset Summary:
      type: object
      properties:
        name:
          description: Dataset name
          type: string
          example: papers
        description:
          description: Description of the data in the dataset
          type: string
          example: Core paper metadata
        README:
          description: Documentation and attribution for the dataset
          type: string
          example: This dataset contains ...
    # Response body of GET /release/{release_id}/dataset/{dataset_name}:
    # the dataset's descriptive fields plus its list of download links.
    Dataset Metadata:
      type: object
      properties:
        name:
          description: Name of the dataset
          type: string
          example: papers
        description:
          description: Description of the data contained in this dataset.
          type: string
          example: Core paper metadata
        README:
          description: License and usage
          type: string
          example: Subject to terms of use as follows ...
        files:
          description: Temporary, pre-signed download links for dataset files
          type: array
          items:
            type: string
            example: https://...
    # Response body of GET /diffs/{start_release_id}/to/{end_release_id}/{dataset_name}.
    Dataset Diff List:
      type: object
      properties:
        dataset:
          type: string
          description: Dataset these diffs are for.
          example: papers
        start_release:
          type: string
          description: Beginning release, i.e. the release currently held by the client.
          example: '2023-08-01'
        end_release:
          type: string
          description: >-
            Ending release, i.e. the release the client wants to update
            to.
          example: '2023-08-29'
        diffs:
          type: array
          description: >-
            List of diffs that need to be applied to bring the dataset
            at 'start_release' up to date with 'end_release'.
          items:
            # A $ref is a URI reference, so the space in the schema name is
            # percent-encoded, matching the other references in this file.
            $ref: '#/components/schemas/Dataset%20Diff'
    # One step in a Dataset Diff List: the files to upsert and the files to
    # delete when moving from `from_release` to `to_release`.
    Dataset Diff:
      type: object
      properties:
        from_release:
          type: string
          description: Baseline release for this diff.
          example: '2023-08-01'
        to_release:
          type: string
          description: Target release for this diff.
          example: '2023-08-07'
        update_files:
          type: array
          description: >-
            List of files that contain updates to the dataset. Each record
            in these files needs to be inserted or updated.
          items:
            type: string
            example: https://...
        delete_files:
          type: array
          description: >-
            List of files that contain deletes from the dataset. Each record
            in these files needs to be deleted.
          items:
            type: string
            example: https://...
  # Explicitly empty: no authentication scheme is declared in this document.
  # NOTE(review): if the service expects an API key, declare an `apiKey`
  # scheme here and attach a `security` requirement to the affected
  # operations — TODO confirm the header name and which operations need it.
  securitySchemes: {}