Apache OpenNLP

OpenNLP provides a Java API for NLP tasks including tokenization, sentence detection, POS tagging, named entity recognition, chunking, parsing, and language detection, with support for training custom models.

Documentation

Specifications

Other Resources

OpenAPI Specification

apache-opennlp-tools.yaml Raw ↑
# OpenAPI 3.0 document describing a REST facade over the Apache OpenNLP toolkit.
openapi: 3.0.3
info:
  title: Apache OpenNLP Tools API
  # Folded scalar: the lines below are joined with single spaces into one
  # paragraph, with no trailing newline.
  description: >-
    Apache OpenNLP is a machine learning based toolkit for processing natural
    language text, supporting tokenization, sentence segmentation, POS tagging,
    named entity extraction, chunking, parsing, and coreference resolution.
    This API represents the REST-accessible surface of the OpenNLP toolkit.
  # Quoted so no tool ever tries to read the version as a number.
  version: '2.5.8'
  contact:
    url: https://opennlp.apache.org/
  license:
    name: Apache 2.0
    url: https://www.apache.org/licenses/LICENSE-2.0
# Vendor extension; the value suggests this spec was derived from
# documentation rather than from code (confirm with the generator).
x-generated-from: documentation
# Single templated server entry; the {host} variable carries host and port.
servers:
  - description: Apache OpenNLP REST service
    url: https://{host}/opennlp
    variables:
      host:
        # Quoted so the host:port pair is always read as a plain string.
        default: 'localhost:8080'

paths:
  # Language detection: accepts a TextRequest and answers with a
  # LanguageDetectionResult.
  /detect/language:
    post:
      tags:
        - Language Detection
      summary: Apache OpenNLP Detect Language
      description: Detect the language of the provided text using the language detector model.
      operationId: detectLanguage
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/TextRequest'
      responses:
        '200':
          description: Language detection result
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/LanguageDetectionResult'
      # Microcks mock settings for this operation.
      x-microcks-operation:
        dispatcher: FALLBACK
        delay: 0

  # Sentence splitting: accepts a TextRequest and answers with a
  # SentenceDetectionResult.
  /detect/sentences:
    post:
      tags:
        - Sentence Detection
      summary: Apache OpenNLP Detect Sentences
      description: Split input text into individual sentences using the sentence detector model.
      operationId: detectSentences
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/TextRequest'
      responses:
        '200':
          description: Sentence detection result
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/SentenceDetectionResult'
      # Microcks mock settings for this operation.
      x-microcks-operation:
        dispatcher: FALLBACK
        delay: 0

  # Tokenization: accepts a TextRequest and answers with a TokenizationResult.
  /tokenize:
    post:
      tags:
        - Tokenization
      summary: Apache OpenNLP Tokenize Text
      description: Segment input text into individual tokens (words, punctuation) using the tokenizer model.
      operationId: tokenize
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/TextRequest'
      responses:
        '200':
          description: Tokenization result
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/TokenizationResult'
      # Microcks mock settings for this operation.
      x-microcks-operation:
        dispatcher: FALLBACK
        delay: 0

  # Named entity recognition: works on pre-tokenized input (TokensRequest)
  # and answers with an NERResult.
  /ner:
    post:
      tags:
        - Named Entity Recognition
      summary: Apache OpenNLP Find Named Entities
      description: Detect and classify named entities (persons, locations, organizations, dates) in tokenized text.
      operationId: findNamedEntities
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/TokensRequest'
      responses:
        '200':
          description: Named entity recognition result
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/NERResult'
      # Microcks mock settings for this operation.
      x-microcks-operation:
        dispatcher: FALLBACK
        delay: 0

  # Part-of-speech tagging: works on pre-tokenized input (TokensRequest)
  # and answers with a POSTaggingResult.
  /pos/tag:
    post:
      tags:
        - POS Tagging
      summary: Apache OpenNLP Tag Parts of Speech
      description: Assign POS tags (noun, verb, adjective, etc.) to each token in the tokenized text.
      operationId: tagPartsOfSpeech
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/TokensRequest'
      responses:
        '200':
          description: POS tagging result
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/POSTaggingResult'
      # Microcks mock settings for this operation.
      x-microcks-operation:
        dispatcher: FALLBACK
        delay: 0

  # Lemmatization: requires both tokens and their POS tags (POSTokensRequest)
  # and answers with a LemmatizationResult.
  /lemmatize:
    post:
      tags:
        - Lemmatization
      summary: Apache OpenNLP Lemmatize Text
      description: Reduce tokens to their base/lemma forms using POS context.
      operationId: lemmatize
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/POSTokensRequest'
      responses:
        '200':
          description: Lemmatization result
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/LemmatizationResult'
      # Microcks mock settings for this operation.
      x-microcks-operation:
        dispatcher: FALLBACK
        delay: 0

  # Chunking: requires both tokens and their POS tags (POSTokensRequest)
  # and answers with a ChunkingResult.
  /chunk:
    post:
      tags:
        - Chunking
      summary: Apache OpenNLP Chunk Text
      description: Identify shallow syntactic phrases (noun phrases, verb phrases) in tokenized text with POS tags.
      operationId: chunkText
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/POSTokensRequest'
      responses:
        '200':
          description: Chunking result
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ChunkingResult'
      # Microcks mock settings for this operation.
      x-microcks-operation:
        dispatcher: FALLBACK
        delay: 0

  # Full parsing: accepts a TextRequest and answers with a ParseResult.
  /parse:
    post:
      tags:
        - Parsing
      summary: Apache OpenNLP Parse Text
      description: Perform full syntactic parsing to build a parse tree for a sentence.
      operationId: parseText
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/TextRequest'
      responses:
        '200':
          description: Parse tree result
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ParseResult'
      # Microcks mock settings for this operation.
      x-microcks-operation:
        dispatcher: FALLBACK
        delay: 0

  # Document categorization: accepts a TextRequest and answers with a
  # CategorizationResult.
  /categorize:
    post:
      tags:
        - Document Categorization
      summary: Apache OpenNLP Categorize Document
      description: Classify a document into predefined categories using the document categorizer model.
      operationId: categorizeDocument
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/TextRequest'
      responses:
        '200':
          description: Document categorization result
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/CategorizationResult'
      # Microcks mock settings for this operation.
      x-microcks-operation:
        dispatcher: FALLBACK
        delay: 0

  # Model catalogue: takes no parameters or body and answers with a ModelList.
  /models:
    get:
      tags:
        - Models
      summary: Apache OpenNLP List Available Models
      description: List all available NLP models loaded in the OpenNLP service.
      operationId: listModels
      responses:
        '200':
          description: List of available models
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ModelList'
      # Microcks mock settings for this operation.
      x-microcks-operation:
        dispatcher: FALLBACK
        delay: 0

  # Single-model lookup: answers with ModelInfo, or with a documented 404
  # "Model not found" response carrying an ErrorResponse.
  /models/{modelId}:
    get:
      tags:
        - Models
      summary: Apache OpenNLP Get Model
      description: Get metadata about a specific NLP model.
      operationId: getModel
      parameters:
        # Path parameters must be marked required in OpenAPI.
        - in: path
          name: modelId
          required: true
          description: Unique model identifier
          schema:
            type: string
            example: en-ner-person.bin
      responses:
        '200':
          description: Model metadata
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ModelInfo'
        '404':
          description: Model not found
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorResponse'
      # Microcks mock settings for this operation.
      x-microcks-operation:
        dispatcher: FALLBACK
        delay: 0

components:
  schemas:
    # Request body for the operations that take raw, untokenized text.
    # Only `text` is mandatory.
    TextRequest:
      type: object
      required:
        - text
      properties:
        text:
          description: Input text to process
          type: string
          example: 'Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29.'
        language:
          description: ISO-639-3 language code hint
          type: string
          example: eng
        modelId:
          description: Specific model to use (optional)
          type: string
          example: en-sent.bin

    # Request body for operations that work on already-tokenized input.
    # Only `tokens` is mandatory.
    TokensRequest:
      type: object
      required:
        - tokens
      properties:
        tokens:
          description: Pre-tokenized array of text tokens
          type: array
          items:
            type: string
          example:
            - Pierre
            - Vinken
            - will
            - join
            - the
            - board
        language:
          description: ISO-639-3 language code
          type: string
          example: eng

    # Request body for operations that need tokens together with their POS
    # tags; both arrays are mandatory.
    POSTokensRequest:
      type: object
      required:
        - tokens
        - posTags
      properties:
        tokens:
          description: Pre-tokenized array of text tokens
          type: array
          items:
            type: string
          example:
            - Pierre
            - Vinken
            - will
            - join
        posTags:
          description: POS tags for each token
          type: array
          items:
            type: string
          example:
            - NNP
            - NNP
            - MD
            - VB

    # Response of the language detection operation: the single best guess,
    # its confidence, and the list of candidate languages.
    LanguageDetectionResult:
      type: object
      properties:
        bestLanguage:
          type: string
          description: ISO-639-3 code of most likely language
          example: eng
        confidence:
          type: number
          description: Confidence score 0-1
          # The 0-1 range stated in the description is also expressed as
          # schema constraints so validators and generated clients can
          # enforce it.
          minimum: 0
          maximum: 1
          example: 0.98
        languages:
          type: array
          description: All detected languages with probabilities
          items:
            $ref: '#/components/schemas/LanguageProbability'

    # One candidate language paired with its probability; used as the item
    # type of LanguageDetectionResult.languages.
    LanguageProbability:
      type: object
      properties:
        language:
          description: ISO-639-3 language code
          type: string
          example: eng
        probability:
          description: Probability score
          type: number
          example: 0.98

    # Response of the sentence detection operation.
    SentenceDetectionResult:
      type: object
      properties:
        sentences:
          description: Detected sentences
          type: array
          items:
            type: string
          example:
            - 'Pierre Vinken will join the board.'
            - 'He is 61 years old.'
        # Span entries; presumably one per detected sentence (confirm with
        # the service implementation).
        spans:
          type: array
          items:
            $ref: '#/components/schemas/Span'

    # Response of the tokenization operation.
    TokenizationResult:
      type: object
      properties:
        tokens:
          description: Extracted tokens
          type: array
          items:
            type: string
          # Punctuation and digit-only tokens are quoted so they stay strings.
          example:
            - Pierre
            - Vinken
            - ','
            - '61'
            - years
            - old
        spans:
          type: array
          items:
            $ref: '#/components/schemas/Span'
        probabilities:
          description: Confidence for each token boundary
          type: array
          items:
            type: number

    # Response of the named entity recognition operation: a wrapper object
    # holding the list of NamedEntity items.
    NERResult:
      type: object
      properties:
        entities:
          items:
            $ref: '#/components/schemas/NamedEntity'
          type: array

    # A single recognized entity, located by token indices with an exclusive
    # end, and restricted to a closed set of entity types.
    NamedEntity:
      type: object
      properties:
        text:
          description: Entity text
          type: string
          example: Pierre Vinken
        type:
          description: Entity type
          type: string
          enum:
            - person
            - location
            - organization
            - date
            - time
            - money
            - percent
            - misc
          example: person
        start:
          description: Start token index
          type: integer
          example: 0
        end:
          description: End token index (exclusive)
          type: integer
          example: 2
        probability:
          description: Confidence score
          type: number
          example: 0.95

    # Response of the POS tagging operation: the tokens, their tags, and an
    # array of numeric probabilities.
    POSTaggingResult:
      type: object
      properties:
        tokens:
          type: array
          items:
            type: string
          example:
            - Pierre
            - Vinken
            - will
            - join
        tags:
          description: POS tags (Penn Treebank tagset)
          type: array
          items:
            type: string
          example:
            - NNP
            - NNP
            - MD
            - VB
        # Presumably one probability per tag (confirm with the service).
        probabilities:
          type: array
          items:
            type: number

    # Response of the lemmatization operation: the tokens and their lemmas
    # (the examples pair "running" with "run" and "faster" with "fast").
    LemmatizationResult:
      type: object
      properties:
        tokens:
          type: array
          items:
            type: string
          example:
            - running
            - faster
        lemmas:
          type: array
          items:
            type: string
          example:
            - run
            - fast

    # Response of the chunking operation: a wrapper object holding the list
    # of Chunk items.
    ChunkingResult:
      type: object
      properties:
        chunks:
          items:
            $ref: '#/components/schemas/Chunk'
          type: array

    # A single shallow-syntactic phrase. The bounds are token indices with an
    # exclusive end, as the example shows: the two-token text
    # "Pierre Vinken" spans 0..2.
    Chunk:
      type: object
      properties:
        text:
          type: string
          description: Chunk text
          example: Pierre Vinken
        type:
          type: string
          description: Chunk type
          example: NP
          enum:
            - NP
            - VP
            - PP
            - ADJP
            - ADVP
            - SBAR
            - PRT
            - CONJP
            - INTJ
            - LST
            - UCP
        start:
          type: integer
          # Description added for parity with NamedEntity.start.
          description: Start token index
          example: 0
        end:
          type: integer
          # Description added for parity with NamedEntity.end.
          description: End token index (exclusive)
          example: 2

    # Response of the parsing operation: the tree as a bracketed string plus
    # a numeric parse probability.
    ParseResult:
      type: object
      properties:
        parseTree:
          description: Penn Treebank-style parse tree
          type: string
          # Quoted because the value starts with a parenthesis-heavy
          # bracketed expression that should always be read as one string.
          example: '(S (NP Pierre Vinken) (VP will join (NP the board)))'
        probability:
          description: Parse probability
          type: number
          example: 0.87

    # Response of the document categorization operation.
    CategorizationResult:
      type: object
      properties:
        bestCategory:
          description: Most likely category label
          type: string
          example: Sports
        # Free-form map: each key is a category label, each value a number.
        probabilities:
          description: Probability for each category
          type: object
          additionalProperties:
            type: number

    # Response of the model listing operation: a wrapper object holding the
    # list of ModelInfo items.
    ModelList:
      type: object
      properties:
        models:
          items:
            $ref: '#/components/schemas/ModelInfo'
          type: array

    # Metadata describing one NLP model; returned by the single-model lookup
    # and used as the item type of ModelList.models.
    ModelInfo:
      type: object
      properties:
        modelId:
          type: string
          description: Unique model identifier
          example: en-ner-person.bin
        language:
          type: string
          description: Model language
          example: eng
        type:
          type: string
          description: Model type
          example: TokenNameFinder
        version:
          type: string
          description: Model version
          # Quoted: a bare 1.5 is parsed as a YAML float, which would not
          # match the declared string type.
          example: '1.5'
        loaded:
          type: boolean
          description: Whether model is currently loaded
          example: true

    # A character-offset range with an exclusive end and an optional type
    # label; referenced by the sentence detection and tokenization results.
    Span:
      type: object
      properties:
        start:
          description: Start character offset
          type: integer
          example: 0
        end:
          description: End character offset (exclusive)
          type: integer
          example: 13
        type:
          description: Span type if applicable
          type: string
          example: person

    # Error payload; referenced by the 404 response of the single-model
    # lookup.
    ErrorResponse:
      type: object
      properties:
        error:
          description: Error message
          type: string
          example: Model not found
        # Numeric code; the example value matches the HTTP 404 status
        # (whether it always mirrors the HTTP status is not stated here).
        code:
          type: integer
          example: 404