Use original Span class and implement Tokenizer interface from OpenNLP. The Maven assembly plugin is used to include the required classes in a jar for binary distribution. This makes it possible to use KorAPTokenizerImpl as a tokenizer in OpenNLP applications. Change-Id: I627dcdd046334db1b1ae9dd60cf7104fd3a3c518

commit: 81925096b645bf17cf688d1311bce002b5a9418f [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Mon Sep 07 15:16:51 2020 +0200
committer: Marc K <morckx@gmail.com> Mon Sep 07 15:25:28 2020 +0200
tree: 72e8930b1290ff3567cd9987d7e425d9b5e4c580
parent: 478632eb7e133969531f3b401576564783183707 [diff]
diff --git a/src/assembly/bin-distribution.xml b/src/assembly/bin-distribution.xml
new file mode 100644
index 0000000..5b4aca9
--- /dev/null
+++ b/src/assembly/bin-distribution.xml

@@ -0,0 +1,40 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<assembly xmlns="http://maven.apache.org/ASSEMBLY/2.1.0"
+          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+          xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.1.0 http://maven.apache.org/xsd/assembly-2.1.0.xsd">
+    <id>jar-with-dependencies-and-exclude-classes</id>
+    <formats>
+        <format>jar</format>
+    </formats>
+    <includeBaseDirectory>false</includeBaseDirectory>
+    <dependencySets>
+        <dependencySet>
+            <outputDirectory>/</outputDirectory>
+            <useProjectArtifact>false</useProjectArtifact>
+            <unpack>true</unpack>
+            <unpackOptions>
+                <includes>
+                    <include>opennlp/tools/util/Span.class</include>
+                    <include>opennlp/tools/tokenize/Tokenizer.class</include>
+                </includes>
+            </unpackOptions>
+            <scope>runtime</scope>
+        </dependencySet>
+    </dependencySets>
+    <fileSets>
+        <fileSet>
+            <directory>${project.basedir}</directory>
+            <outputDirectory>META-INF/</outputDirectory>
+            <includes>
+                <include>README*</include>
+                <include>Readme*</include>
+                <include>LICENSE*</include>
+            </includes>
+            <useDefaultExcludes>true</useDefaultExcludes>
+        </fileSet>
+        <fileSet>
+            <outputDirectory>/</outputDirectory>
+            <directory>${project.build.outputDirectory}</directory>
+        </fileSet>
+    </fileSets>
+</assembly>
\ No newline at end of file

diff --git a/src/assembly/src-distribution.xml b/src/assembly/src-distribution.xml
new file mode 100644
index 0000000..675c51e
--- /dev/null
+++ b/src/assembly/src-distribution.xml

@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<assembly xmlns="http://maven.apache.org/ASSEMBLY/2.1.0"
+          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+          xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.1.0 http://maven.apache.org/xsd/assembly-2.1.0.xsd">
+    <id>src</id>
+    <formats>
+        <format>zip</format>
+    </formats>
+    <fileSets>
+        <fileSet>
+            <directory>${project.basedir}</directory>
+            <includes>
+                <include>README*</include>
+                <include>Readme*</include>
+                <include>LICENSE*</include>
+                <include>NOTICE*</include>
+                <include>pom.xml</include>
+            </includes>
+            <useDefaultExcludes>true</useDefaultExcludes>
+        </fileSet>
+        <fileSet>
+            <directory>${project.build.sourceDirectory}/src</directory>
+            <useDefaultExcludes>true</useDefaultExcludes>
+        </fileSet>
+    </fileSets>
+</assembly>

diff --git a/src/main/java/de/ids_mannheim/korap/tokenizer/NOTICE b/src/main/java/de/ids_mannheim/korap/tokenizer/NOTICE
deleted file mode 100644
index 2ef39b3..0000000
--- a/src/main/java/de/ids_mannheim/korap/tokenizer/NOTICE
+++ /dev/null

@@ -1,2 +0,0 @@
-The Span class contains code from OpenNLP developed at
-The Apache Software Foundation (http://www.apache.org/).

diff --git a/src/main/java/de/ids_mannheim/korap/tokenizer/Span.java b/src/main/java/de/ids_mannheim/korap/tokenizer/Span.java
deleted file mode 100644
index 7d9429d..0000000
--- a/src/main/java/de/ids_mannheim/korap/tokenizer/Span.java
+++ /dev/null

@@ -1,179 +0,0 @@
-package de.ids_mannheim.korap.tokenizer;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-import java.io.Serializable;
-import java.util.Objects;
-
-
-/**
- * Class for storing start and end integer offsets.
- *
- */
-public class Span implements Comparable<Span>, Serializable {
-
-    private final int start;
-    private final int end;
-    private final double prob;//default is 0
-    private final String type;
-
-    /**
-     * Initializes a new Span Object. Sets the prob to 0 as default.
-     *
-     * @param s start of span.
-     * @param e end of span, which is +1 more than the last element in the span.
-     * @param type the type of the span
-     */
-    public Span(int s, int e, String type) {
-        this(s, e, type, 0d);
-    }
-
-    /**
-     * Initializes a new Span Object.
-     *
-     * @param s start of span.
-     * @param e end of span, which is +1 more than the last element in the span.
-     * @param type the type of the span
-     * @param prob probability of span.
-     */
-    public Span(int s, int e, String type, double prob) {
-
-        if (s < 0) {
-            throw new IllegalArgumentException("start index must be zero or greater: " + s);
-        }
-        if (e < 0) {
-            throw new IllegalArgumentException("end index must be zero or greater: " + e);
-        }
-        if (s > e) {
-            throw new IllegalArgumentException(
-                    "start index must not be larger than end index: " + "start=" + s + ", end=" + e);
-        }
-
-        start = s;
-        end = e;
-        this.prob = prob;
-        this.type = type;
-    }
-
-    /**
-     * Return the start of a span.
-     *
-     * @return the start of a span.
-     *
-     */
-    public int getStart() {
-        return start;
-    }
-
-    /**
-     * Return the end of a span.
-     *
-     * Note: that the returned index is one past the actual end of the span in the
-     * text, or the first element past the end of the span.
-     *
-     * @return the end of a span.
-     *
-     */
-    public int getEnd() {
-        return end;
-    }
-
-    /**
-     * Retrieves the type of the span.
-     *
-     * @return the type or null if not set
-     */
-    public String getType() {
-        return type;
-    }
-
-    /**
-     * Compares the specified span to the current span.
-     */
-    public int compareTo(Span s) {
-        if (getStart() < s.getStart()) {
-            return -1;
-        } else if (getStart() == s.getStart()) {
-            if (getEnd() > s.getEnd()) {
-                return -1;
-            } else if (getEnd() < s.getEnd()) {
-                return 1;
-            } else {
-                // compare the type
-                if (getType() == null && s.getType() == null) {
-                    return 0;
-                } else if (getType() != null && s.getType() != null) {
-                    // use type lexicography order
-                    return getType().compareTo(s.getType());
-                } else if (getType() != null) {
-                    return -1;
-                }
-                return 1;
-            }
-        } else {
-            return 1;
-        }
-    }
-
-    /**
-     * Generates a hash code of the current span.
-     */
-    @Override
-    public int hashCode() {
-        return Objects.hash(getStart(), getEnd(), getType());
-    }
-
-    /**
-     * Checks if the specified span is equal to the current span.
-     */
-    @Override
-    public boolean equals(Object o) {
-        if (o == this) {
-            return true;
-        }
-
-        if (o instanceof Span) {
-            Span s = (Span) o;
-
-            return getStart() == s.getStart() && getEnd() == s.getEnd() && Objects.equals(getType(), s.getType());
-        }
-
-        return false;
-    }
-
-    /**
-     * Generates a human readable string.
-     */
-    @Override
-    public String toString() {
-        StringBuilder toStringBuffer = new StringBuilder(15);
-        toStringBuffer.append("[");
-        toStringBuffer.append(getStart());
-        toStringBuffer.append("..");
-        toStringBuffer.append(getEnd());
-        toStringBuffer.append(")");
-        if (getType() != null) {
-            toStringBuffer.append(" ");
-            toStringBuffer.append(getType());
-        }
-
-        return toStringBuffer.toString();
-    }
-
-}

diff --git a/src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.jflex b/src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.jflex
index 584f0cb..ca51178 100644
--- a/src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.jflex
+++ b/src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.jflex

@@ -56,7 +56,7 @@
 import java.lang.StringBuffer;
 import java.util.ArrayList;
 import java.util.List;
-// import Span;
+import opennlp.tools.util.Span;
 %%
 
 /**
@@ -67,7 +67,7 @@
 %class KorAPTokenizerImpl
 %unicode
 %public
-// %implements opennlp.tools.tokenize.Tokenizer
+%implements opennlp.tools.tokenize.Tokenizer
 %type Span
 %function getNextToken
 %char
commit	81925096b645bf17cf688d1311bce002b5a9418f	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Mon Sep 07 15:16:51 2020 +0200
committer	Marc K <morckx@gmail.com>	Mon Sep 07 15:25:28 2020 +0200
tree	72e8930b1290ff3567cd9987d7e425d9b5e4c580
parent	478632eb7e133969531f3b401576564783183707 [diff]