Use original Span class and implement Tokenizer interface from OpenNLP

Using maven assembly plugin to include the required classes into a jar for binary distribution.

This makes it possible to use KorAPTokenizerImpl as tokenizer in OpenNLP applications.

Change-Id: I627dcdd046334db1b1ae9dd60cf7104fd3a3c518

commit: 81925096b645bf17cf688d1311bce002b5a9418f [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Mon Sep 07 15:16:51 2020 +0200
committer: Marc K <morckx@gmail.com> Mon Sep 07 15:25:28 2020 +0200
tree: 72e8930b1290ff3567cd9987d7e425d9b5e4c580
parent: 478632eb7e133969531f3b401576564783183707 [diff]
diff --git a/Readme.md b/Readme.md
index 66f063a..470d94b 100644
--- a/Readme.md
+++ b/Readme.md

@@ -1,5 +1,5 @@
 # KorAP Tokenizer
-Efficient DFA tokenizer with character offset output based on [JFlex](https://www.jflex.de/), suitable for German and other European languages. 
+Efficient, [OpenNLP tools](https://opennlp.apache.org) compatible DFA tokenizer with character offset output based on [JFlex](https://www.jflex.de/), suitable for German and other European languages.
 
 ## Description
 The KorAP tokenizer is used for the German Reference Corpus DeReKo. Being based on a finite state automaton, 
@@ -7,6 +7,9 @@
 An important feature in the DeReKo/KorAP context is also, that it reliably reports the character offsets of the tokens 
 so that this information can be used for applying standoff annotations.
  
+The main class `KorAPTokenizerImpl` implements the [`opennlp.tools.tokenize.Tokenizer`](https://opennlp.apache.org/docs/1.8.2/apidocs/opennlp-tools/opennlp/tools/tokenize/Tokenizer.html)
+interface and can thus be used as a drop-in replacement in OpenNLP applications.
+
 The scanner is based on the Lucene scanner with modifications from [David Hall](https://github.com/dlwh).  
 
 Our changes mainly concern a good coverage of German abbreviations, 
@@ -51,8 +54,7 @@
 Corpus Analysis Platform at the Leibniz Institute for German Language
 ([IDS](http://www.ids-mannheim.de/)).
 
-The package contains code from [Apache Lucene](https://lucene.apache.org/) with modifications by Jim Hall and code from 
-[Apache OpenNLP](https://opennlp.apache.org/) tools (Span class).
+The package contains code from [Apache Lucene](https://lucene.apache.org/) with modifications by Jim Hall.
 
 It is published under the [Apache 2.0 License](LICENSE).
 

diff --git a/pom.xml b/pom.xml
index 67ac08b..fda28dc 100644
--- a/pom.xml
+++ b/pom.xml

@@ -107,10 +107,20 @@
                 </configuration>
             </plugin>
             <plugin>
-                <!-- Build an executable JAR -->
                 <groupId>org.apache.maven.plugins</groupId>
                 <artifactId>maven-jar-plugin</artifactId>
-                <version>3.1.0</version>
+                <version>3.2.0</version>
+                <executions>
+                    <execution>
+                        <id>default-jar</id>
+                        <!-- use assembly plugin to build the jar -->
+                        <phase>none</phase>
+                        <configuration>
+                            <finalName>unwanted</finalName>
+                            <classifier>unwanted</classifier>
+                        </configuration>
+                    </execution>
+                </executions>
                 <configuration>
                     <archive>
                         <manifest>
@@ -134,10 +144,43 @@
                     </execution>
                 </executions>
             </plugin>
+
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-assembly-plugin</artifactId>
+                <version>3.3.0</version>
+                <configuration>
+                    <appendAssemblyId>false</appendAssemblyId>
+                    <descriptors>
+                        <descriptor>src/assembly/src-distribution.xml</descriptor>
+                        <descriptor>src/assembly/bin-distribution.xml</descriptor>
+                    </descriptors>
+                    <archive>
+                        <manifest>
+                            <mainClass>de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl</mainClass>
+                        </manifest>
+                    </archive>
+                </configuration>
+                <executions>
+                    <execution>
+                        <id>make-assembly</id>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>single</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
         </plugins>
     </build>
 
     <dependencies>
+        <!-- https://mvnrepository.com/artifact/org.apache.opennlp/opennlp-tools -->
+        <dependency>
+            <groupId>org.apache.opennlp</groupId>
+            <artifactId>opennlp-tools</artifactId>
+            <version>1.9.3</version>
+        </dependency>
         <dependency>
             <groupId>org.apache.maven.plugins</groupId>
             <artifactId>maven-surefire-plugin</artifactId>

diff --git a/src/assembly/bin-distribution.xml b/src/assembly/bin-distribution.xml
new file mode 100644
index 0000000..5b4aca9
--- /dev/null
+++ b/src/assembly/bin-distribution.xml

@@ -0,0 +1,40 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<assembly xmlns="http://maven.apache.org/ASSEMBLY/2.1.0"
+          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+          xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.1.0 http://maven.apache.org/xsd/assembly-2.1.0.xsd">
+    <id>jar-with-dependencies-and-exclude-classes</id>
+    <formats>
+        <format>jar</format>
+    </formats>
+    <includeBaseDirectory>false</includeBaseDirectory>
+    <dependencySets>
+        <dependencySet>
+            <outputDirectory>/</outputDirectory>
+            <useProjectArtifact>false</useProjectArtifact>
+            <unpack>true</unpack>
+            <unpackOptions>
+                <includes>
+                    <include>opennlp/tools/util/Span.class</include>
+                    <include>opennlp/tools/tokenize/Tokenizer.class</include>
+                </includes>
+            </unpackOptions>
+            <scope>runtime</scope>
+        </dependencySet>
+    </dependencySets>
+    <fileSets>
+        <fileSet>
+            <directory>${project.basedir}</directory>
+            <outputDirectory>META-INF/</outputDirectory>
+            <includes>
+                <include>README*</include>
+                <include>Readme*</include>
+                <include>LICENSE*</include>
+            </includes>
+            <useDefaultExcludes>true</useDefaultExcludes>
+        </fileSet>
+        <fileSet>
+            <outputDirectory>/</outputDirectory>
+            <directory>${project.build.outputDirectory}</directory>
+        </fileSet>
+    </fileSets>
+</assembly>
\ No newline at end of file

diff --git a/src/assembly/src-distribution.xml b/src/assembly/src-distribution.xml
new file mode 100644
index 0000000..675c51e
--- /dev/null
+++ b/src/assembly/src-distribution.xml

@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<assembly xmlns="http://maven.apache.org/ASSEMBLY/2.1.0"
+          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+          xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.1.0 http://maven.apache.org/xsd/assembly-2.1.0.xsd">
+    <id>src</id>
+    <formats>
+        <format>zip</format>
+    </formats>
+    <fileSets>
+        <fileSet>
+            <directory>${project.basedir}</directory>
+            <includes>
+                <include>README*</include>
+                <include>Readme*</include>
+                <include>LICENSE*</include>
+                <include>NOTICE*</include>
+                <include>pom.xml</include>
+            </includes>
+            <useDefaultExcludes>true</useDefaultExcludes>
+        </fileSet>
+        <fileSet>
+            <directory>${project.build.sourceDirectory}/src</directory>
+            <useDefaultExcludes>true</useDefaultExcludes>
+        </fileSet>
+    </fileSets>
+</assembly>

diff --git a/src/main/java/de/ids_mannheim/korap/tokenizer/NOTICE b/src/main/java/de/ids_mannheim/korap/tokenizer/NOTICE
deleted file mode 100644
index 2ef39b3..0000000
--- a/src/main/java/de/ids_mannheim/korap/tokenizer/NOTICE
+++ /dev/null

@@ -1,2 +0,0 @@
-The Span class contains code from OpenNLP developed at
-The Apache Software Foundation (http://www.apache.org/).

diff --git a/src/main/java/de/ids_mannheim/korap/tokenizer/Span.java b/src/main/java/de/ids_mannheim/korap/tokenizer/Span.java
deleted file mode 100644
index 7d9429d..0000000
--- a/src/main/java/de/ids_mannheim/korap/tokenizer/Span.java
+++ /dev/null

@@ -1,179 +0,0 @@
-package de.ids_mannheim.korap.tokenizer;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-import java.io.Serializable;
-import java.util.Objects;
-
-
-/**
- * Class for storing start and end integer offsets.
- *
- */
-public class Span implements Comparable<Span>, Serializable {
-
-    private final int start;
-    private final int end;
-    private final double prob;//default is 0
-    private final String type;
-
-    /**
-     * Initializes a new Span Object. Sets the prob to 0 as default.
-     *
-     * @param s start of span.
-     * @param e end of span, which is +1 more than the last element in the span.
-     * @param type the type of the span
-     */
-    public Span(int s, int e, String type) {
-        this(s, e, type, 0d);
-    }
-
-    /**
-     * Initializes a new Span Object.
-     *
-     * @param s start of span.
-     * @param e end of span, which is +1 more than the last element in the span.
-     * @param type the type of the span
-     * @param prob probability of span.
-     */
-    public Span(int s, int e, String type, double prob) {
-
-        if (s < 0) {
-            throw new IllegalArgumentException("start index must be zero or greater: " + s);
-        }
-        if (e < 0) {
-            throw new IllegalArgumentException("end index must be zero or greater: " + e);
-        }
-        if (s > e) {
-            throw new IllegalArgumentException(
-                    "start index must not be larger than end index: " + "start=" + s + ", end=" + e);
-        }
-
-        start = s;
-        end = e;
-        this.prob = prob;
-        this.type = type;
-    }
-
-    /**
-     * Return the start of a span.
-     *
-     * @return the start of a span.
-     *
-     */
-    public int getStart() {
-        return start;
-    }
-
-    /**
-     * Return the end of a span.
-     *
-     * Note: that the returned index is one past the actual end of the span in the
-     * text, or the first element past the end of the span.
-     *
-     * @return the end of a span.
-     *
-     */
-    public int getEnd() {
-        return end;
-    }
-
-    /**
-     * Retrieves the type of the span.
-     *
-     * @return the type or null if not set
-     */
-    public String getType() {
-        return type;
-    }
-
-    /**
-     * Compares the specified span to the current span.
-     */
-    public int compareTo(Span s) {
-        if (getStart() < s.getStart()) {
-            return -1;
-        } else if (getStart() == s.getStart()) {
-            if (getEnd() > s.getEnd()) {
-                return -1;
-            } else if (getEnd() < s.getEnd()) {
-                return 1;
-            } else {
-                // compare the type
-                if (getType() == null && s.getType() == null) {
-                    return 0;
-                } else if (getType() != null && s.getType() != null) {
-                    // use type lexicography order
-                    return getType().compareTo(s.getType());
-                } else if (getType() != null) {
-                    return -1;
-                }
-                return 1;
-            }
-        } else {
-            return 1;
-        }
-    }
-
-    /**
-     * Generates a hash code of the current span.
-     */
-    @Override
-    public int hashCode() {
-        return Objects.hash(getStart(), getEnd(), getType());
-    }
-
-    /**
-     * Checks if the specified span is equal to the current span.
-     */
-    @Override
-    public boolean equals(Object o) {
-        if (o == this) {
-            return true;
-        }
-
-        if (o instanceof Span) {
-            Span s = (Span) o;
-
-            return getStart() == s.getStart() && getEnd() == s.getEnd() && Objects.equals(getType(), s.getType());
-        }
-
-        return false;
-    }
-
-    /**
-     * Generates a human readable string.
-     */
-    @Override
-    public String toString() {
-        StringBuilder toStringBuffer = new StringBuilder(15);
-        toStringBuffer.append("[");
-        toStringBuffer.append(getStart());
-        toStringBuffer.append("..");
-        toStringBuffer.append(getEnd());
-        toStringBuffer.append(")");
-        if (getType() != null) {
-            toStringBuffer.append(" ");
-            toStringBuffer.append(getType());
-        }
-
-        return toStringBuffer.toString();
-    }
-
-}

diff --git a/src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.jflex b/src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.jflex
index 584f0cb..ca51178 100644
--- a/src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.jflex
+++ b/src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.jflex

@@ -56,7 +56,7 @@
 import java.lang.StringBuffer;
 import java.util.ArrayList;
 import java.util.List;
-// import Span;
+import opennlp.tools.util.Span;
 %%
 
 /**
@@ -67,7 +67,7 @@
 %class KorAPTokenizerImpl
 %unicode
 %public
-// %implements opennlp.tools.tokenize.Tokenizer
+%implements opennlp.tools.tokenize.Tokenizer
 %type Span
 %function getNextToken
 %char

diff --git a/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.java b/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.java
index 71726c8..5711417 100644
--- a/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.java
+++ b/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.java

@@ -60,11 +60,11 @@
 import java.lang.StringBuffer;
 import java.util.ArrayList;
 import java.util.List;
-// import Span;
+import opennlp.tools.util.Span;
 
 // See https://github.com/jflex-de/jflex/issues/222
 @SuppressWarnings("FallThrough")
-public class KorAPTokenizerImpl {
+public class KorAPTokenizerImpl implements opennlp.tools.tokenize.Tokenizer {
 
   /** This character denotes the end of file. */
   public static final int YYEOF = -1;
commit	81925096b645bf17cf688d1311bce002b5a9418f	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Mon Sep 07 15:16:51 2020 +0200
committer	Marc K <morckx@gmail.com>	Mon Sep 07 15:25:28 2020 +0200
tree	72e8930b1290ff3567cd9987d7e425d9b5e4c580
parent	478632eb7e133969531f3b401576564783183707 [diff]