Use original Span class and implement Tokenizer interface from OpenNLP
Use the Maven assembly plugin to bundle the required OpenNLP classes
(Span, Tokenizer) into the jar for binary distribution.
This makes it possible to use KorAPTokenizerImpl as tokenizer in OpenNLP
applications.
Change-Id: I627dcdd046334db1b1ae9dd60cf7104fd3a3c518
diff --git a/Readme.md b/Readme.md
index 66f063a..470d94b 100644
--- a/Readme.md
+++ b/Readme.md
@@ -1,5 +1,5 @@
# KorAP Tokenizer
-Efficient DFA tokenizer with character offset output based on [JFlex](https://www.jflex.de/), suitable for German and other European languages.
+Efficient, [OpenNLP tools](https://opennlp.apache.org)-compatible DFA tokenizer with character offset output based on [JFlex](https://www.jflex.de/), suitable for German and other European languages.
## Description
The KorAP tokenizer is used for the German Reference Corpus DeReKo. Being based on a finite state automaton,
@@ -7,6 +7,9 @@
An important feature in the DeReKo/KorAP context is also, that it reliably reports the character offsets of the tokens
so that this information can be used for applying standoff annotations.
+The main class `KorAPTokenizerImpl` implements the [`opennlp.tools.tokenize.Tokenizer`](https://opennlp.apache.org/docs/1.8.2/apidocs/opennlp-tools/opennlp/tools/tokenize/Tokenizer.html)
+interface and can thus be used as a drop-in replacement in OpenNLP applications.
+
The scanner is based on the Lucene scanner with modifications from [David Hall](https://github.com/dlwh).
Our changes mainly concern a good coverage of German abbreviations,
@@ -51,8 +54,7 @@
Corpus Analysis Platform at the Leibniz Institute for German Language
([IDS](http://www.ids-mannheim.de/)).
-The package contains code from [Apache Lucene](https://lucene.apache.org/) with modifications by Jim Hall and code from
-[Apache OpenNLP](https://opennlp.apache.org/) tools (Span class).
+The package contains code from [Apache Lucene](https://lucene.apache.org/) with modifications by David Hall.
It is published under the [Apache 2.0 License](LICENSE).
diff --git a/pom.xml b/pom.xml
index 67ac08b..fda28dc 100644
--- a/pom.xml
+++ b/pom.xml
@@ -107,10 +107,20 @@
</configuration>
</plugin>
<plugin>
- <!-- Build an executable JAR -->
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
- <version>3.1.0</version>
+ <version>3.2.0</version>
+ <executions>
+ <execution>
+ <id>default-jar</id>
+ <!-- use assembly plugin to build the jar -->
+ <phase>none</phase>
+ <configuration>
+ <finalName>unwanted</finalName>
+ <classifier>unwanted</classifier>
+ </configuration>
+ </execution>
+ </executions>
<configuration>
<archive>
<manifest>
@@ -134,10 +144,43 @@
</execution>
</executions>
</plugin>
+
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-assembly-plugin</artifactId>
+ <version>3.3.0</version>
+ <configuration>
+ <appendAssemblyId>false</appendAssemblyId>
+ <descriptors>
+ <descriptor>src/assembly/src-distribution.xml</descriptor>
+ <descriptor>src/assembly/bin-distribution.xml</descriptor>
+ </descriptors>
+ <archive>
+ <manifest>
+ <mainClass>de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl</mainClass>
+ </manifest>
+ </archive>
+ </configuration>
+ <executions>
+ <execution>
+ <id>make-assembly</id>
+ <phase>package</phase>
+ <goals>
+ <goal>single</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
</plugins>
</build>
<dependencies>
+ <!-- https://mvnrepository.com/artifact/org.apache.opennlp/opennlp-tools -->
+ <dependency>
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>opennlp-tools</artifactId>
+ <version>1.9.3</version>
+ </dependency>
<dependency>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
diff --git a/src/assembly/bin-distribution.xml b/src/assembly/bin-distribution.xml
new file mode 100644
index 0000000..5b4aca9
--- /dev/null
+++ b/src/assembly/bin-distribution.xml
@@ -0,0 +1,40 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<assembly xmlns="http://maven.apache.org/ASSEMBLY/2.1.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.1.0 http://maven.apache.org/xsd/assembly-2.1.0.xsd">
+ <id>jar-with-dependencies-and-exclude-classes</id>
+ <formats>
+ <format>jar</format>
+ </formats>
+ <includeBaseDirectory>false</includeBaseDirectory>
+ <dependencySets>
+ <dependencySet>
+ <outputDirectory>/</outputDirectory>
+ <useProjectArtifact>false</useProjectArtifact>
+ <unpack>true</unpack>
+ <unpackOptions>
+ <includes>
+ <include>opennlp/tools/util/Span.class</include>
+ <include>opennlp/tools/tokenize/Tokenizer.class</include>
+ </includes>
+ </unpackOptions>
+ <scope>runtime</scope>
+ </dependencySet>
+ </dependencySets>
+ <fileSets>
+ <fileSet>
+ <directory>${project.basedir}</directory>
+ <outputDirectory>META-INF/</outputDirectory>
+ <includes>
+ <include>README*</include>
+ <include>Readme*</include>
+ <include>LICENSE*</include>
+ </includes>
+ <useDefaultExcludes>true</useDefaultExcludes>
+ </fileSet>
+ <fileSet>
+ <outputDirectory>/</outputDirectory>
+ <directory>${project.build.outputDirectory}</directory>
+ </fileSet>
+ </fileSets>
+</assembly>
\ No newline at end of file
diff --git a/src/assembly/src-distribution.xml b/src/assembly/src-distribution.xml
new file mode 100644
index 0000000..675c51e
--- /dev/null
+++ b/src/assembly/src-distribution.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<assembly xmlns="http://maven.apache.org/ASSEMBLY/2.1.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.1.0 http://maven.apache.org/xsd/assembly-2.1.0.xsd">
+ <id>src</id>
+ <formats>
+ <format>zip</format>
+ </formats>
+ <fileSets>
+ <fileSet>
+ <directory>${project.basedir}</directory>
+ <includes>
+ <include>README*</include>
+ <include>Readme*</include>
+ <include>LICENSE*</include>
+ <include>NOTICE*</include>
+ <include>pom.xml</include>
+ </includes>
+ <useDefaultExcludes>true</useDefaultExcludes>
+ </fileSet>
+ <fileSet>
+ <directory>${project.basedir}/src</directory> <!-- whole src tree; project.build.sourceDirectory is already src/main/java, so appending /src pointed at a nonexistent directory -->
+ <useDefaultExcludes>true</useDefaultExcludes>
+ </fileSet>
+ </fileSets>
+</assembly>
diff --git a/src/main/java/de/ids_mannheim/korap/tokenizer/NOTICE b/src/main/java/de/ids_mannheim/korap/tokenizer/NOTICE
deleted file mode 100644
index 2ef39b3..0000000
--- a/src/main/java/de/ids_mannheim/korap/tokenizer/NOTICE
+++ /dev/null
@@ -1,2 +0,0 @@
-The Span class contains code from OpenNLP developed at
-The Apache Software Foundation (http://www.apache.org/).
diff --git a/src/main/java/de/ids_mannheim/korap/tokenizer/Span.java b/src/main/java/de/ids_mannheim/korap/tokenizer/Span.java
deleted file mode 100644
index 7d9429d..0000000
--- a/src/main/java/de/ids_mannheim/korap/tokenizer/Span.java
+++ /dev/null
@@ -1,179 +0,0 @@
-package de.ids_mannheim.korap.tokenizer;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-import java.io.Serializable;
-import java.util.Objects;
-
-
-/**
- * Class for storing start and end integer offsets.
- *
- */
-public class Span implements Comparable<Span>, Serializable {
-
- private final int start;
- private final int end;
- private final double prob;//default is 0
- private final String type;
-
- /**
- * Initializes a new Span Object. Sets the prob to 0 as default.
- *
- * @param s start of span.
- * @param e end of span, which is +1 more than the last element in the span.
- * @param type the type of the span
- */
- public Span(int s, int e, String type) {
- this(s, e, type, 0d);
- }
-
- /**
- * Initializes a new Span Object.
- *
- * @param s start of span.
- * @param e end of span, which is +1 more than the last element in the span.
- * @param type the type of the span
- * @param prob probability of span.
- */
- public Span(int s, int e, String type, double prob) {
-
- if (s < 0) {
- throw new IllegalArgumentException("start index must be zero or greater: " + s);
- }
- if (e < 0) {
- throw new IllegalArgumentException("end index must be zero or greater: " + e);
- }
- if (s > e) {
- throw new IllegalArgumentException(
- "start index must not be larger than end index: " + "start=" + s + ", end=" + e);
- }
-
- start = s;
- end = e;
- this.prob = prob;
- this.type = type;
- }
-
- /**
- * Return the start of a span.
- *
- * @return the start of a span.
- *
- */
- public int getStart() {
- return start;
- }
-
- /**
- * Return the end of a span.
- *
- * Note: that the returned index is one past the actual end of the span in the
- * text, or the first element past the end of the span.
- *
- * @return the end of a span.
- *
- */
- public int getEnd() {
- return end;
- }
-
- /**
- * Retrieves the type of the span.
- *
- * @return the type or null if not set
- */
- public String getType() {
- return type;
- }
-
- /**
- * Compares the specified span to the current span.
- */
- public int compareTo(Span s) {
- if (getStart() < s.getStart()) {
- return -1;
- } else if (getStart() == s.getStart()) {
- if (getEnd() > s.getEnd()) {
- return -1;
- } else if (getEnd() < s.getEnd()) {
- return 1;
- } else {
- // compare the type
- if (getType() == null && s.getType() == null) {
- return 0;
- } else if (getType() != null && s.getType() != null) {
- // use type lexicography order
- return getType().compareTo(s.getType());
- } else if (getType() != null) {
- return -1;
- }
- return 1;
- }
- } else {
- return 1;
- }
- }
-
- /**
- * Generates a hash code of the current span.
- */
- @Override
- public int hashCode() {
- return Objects.hash(getStart(), getEnd(), getType());
- }
-
- /**
- * Checks if the specified span is equal to the current span.
- */
- @Override
- public boolean equals(Object o) {
- if (o == this) {
- return true;
- }
-
- if (o instanceof Span) {
- Span s = (Span) o;
-
- return getStart() == s.getStart() && getEnd() == s.getEnd() && Objects.equals(getType(), s.getType());
- }
-
- return false;
- }
-
- /**
- * Generates a human readable string.
- */
- @Override
- public String toString() {
- StringBuilder toStringBuffer = new StringBuilder(15);
- toStringBuffer.append("[");
- toStringBuffer.append(getStart());
- toStringBuffer.append("..");
- toStringBuffer.append(getEnd());
- toStringBuffer.append(")");
- if (getType() != null) {
- toStringBuffer.append(" ");
- toStringBuffer.append(getType());
- }
-
- return toStringBuffer.toString();
- }
-
-}
diff --git a/src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.jflex b/src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.jflex
index 584f0cb..ca51178 100644
--- a/src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.jflex
+++ b/src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.jflex
@@ -56,7 +56,7 @@
import java.lang.StringBuffer;
import java.util.ArrayList;
import java.util.List;
-// import Span;
+import opennlp.tools.util.Span;
%%
/**
@@ -67,7 +67,7 @@
%class KorAPTokenizerImpl
%unicode
%public
-// %implements opennlp.tools.tokenize.Tokenizer
+%implements opennlp.tools.tokenize.Tokenizer
%type Span
%function getNextToken
%char
diff --git a/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.java b/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.java
index 71726c8..5711417 100644
--- a/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.java
+++ b/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.java
@@ -60,11 +60,11 @@
import java.lang.StringBuffer;
import java.util.ArrayList;
import java.util.List;
-// import Span;
+import opennlp.tools.util.Span;
// See https://github.com/jflex-de/jflex/issues/222
@SuppressWarnings("FallThrough")
-public class KorAPTokenizerImpl {
+public class KorAPTokenizerImpl implements opennlp.tools.tokenize.Tokenizer {
/** This character denotes the end of file. */
public static final int YYEOF = -1;