Make tokenizer implementation exchangeable
Allows for easy adaptation and integration of other language
specializations and other tokenizers, also from the command line via the
new -T option, which allows specifying the tokenizer class name.
TODO: add documentation to the KorapTokenizer interface
Change-Id: Ib95793f47887a3b0829d68ebdbf8bc40815a0605
diff --git a/.gitignore b/.gitignore
index b109df0..b5ddfd2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,7 @@
.*
!/.gitignore
target
-!/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.java
+!/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.java
tmp
logs
cache_store
diff --git a/Readme.md b/Readme.md
index 285866b..31a1a63 100644
--- a/Readme.md
+++ b/Readme.md
@@ -1,13 +1,18 @@
# KorAP Tokenizer
-Efficient, [OpenNLP tools](https://opennlp.apache.org) compatible DFA tokenizer and sentence splitter with character offset output based on [JFlex](https://www.jflex.de/), suitable for German and other European languages.
+Interface and implementation of a tokenizer and sentence splitter that can be used
-## Description
-The KorAP tokenizer is used for the German Reference Corpus DeReKo. Being based on a finite state automaton,
+* as standalone tokenizer and/or sentence splitter
+* within the KorAP ingestion pipeline
+* within the [OpenNLP tools](https://opennlp.apache.org) framework
+
+## DeReKo Tokenizer (included default implementation)
+The included default implementation (`DerekoDfaTokenizer`) is a highly efficient DFA tokenizer and sentence splitter with character offset output based on [JFlex](https://www.jflex.de/), suitable for German and other European languages.
+It is used for the German Reference Corpus DeReKo. Being based on a finite state automaton,
it is not accurate as language model based tokenizers, but with ~5 billion words per hour typically more efficient.
An important feature in the DeReKo/KorAP context is also, that it reliably reports the character offsets of the tokens
so that this information can be used for applying standoff annotations.
-The main class `KorAPTokenizerImpl` implements the [`opennlp.tools.tokenize.Tokenizer`](https://opennlp.apache.org/docs/1.8.2/apidocs/opennlp-tools/opennlp/tools/tokenize/Tokenizer.html)
+`DerekoDfaTokenizer` and any implementation of the `KorapTokenizer` interface also implement the [`opennlp.tools.tokenize.Tokenizer`](https://opennlp.apache.org/docs/1.8.2/apidocs/opennlp-tools/opennlp/tools/tokenize/Tokenizer.html)
and [`opennlp.tools.sentdetect.SentenceDetector`](https://opennlp.apache.org/docs/1.8.2/apidocs/opennlp-tools/opennlp/tools/sentdetect/SentenceDetector.html)
interfaces and can thus be used as a drop-in replacement in OpenNLP applications.
@@ -16,6 +21,10 @@
Our changes mainly concern a good coverage of German abbreviations,
and some updates for handling computer mediated communication, optimized and tested against the gold data from the [EmpiriST 2015](https://sites.google.com/site/empirist2015/) shared task (Beißwenger et al. 2016).
+### Adaptations for other Languages
+To adapt the included implementation to another language you will probably want to start by replacing the abbreviations
+pattern defined in `SEABBR` in the jflex source.
+
## Installation
```shell script
$ MAVEN_OPTS="-Xss50m" mvn clean install
diff --git a/pom.xml b/pom.xml
index 9aca3ed..f133064 100644
--- a/pom.xml
+++ b/pom.xml
@@ -117,7 +117,7 @@
<manifest>
<addClasspath>true</addClasspath>
<classpathPrefix>lib/</classpathPrefix>
- <mainClass>de.ids_mannheim.korap.tokenizer.KorAPTokenizer</mainClass>
+ <mainClass>de.ids_mannheim.korap.tokenizer.Main</mainClass>
</manifest>
</archive>
</configuration>
@@ -148,7 +148,7 @@
</descriptors>
<archive>
<manifest>
- <mainClass>de.ids_mannheim.korap.tokenizer.KorAPTokenizer</mainClass>
+ <mainClass>de.ids_mannheim.korap.tokenizer.Main</mainClass>
</manifest>
</archive>
</configuration>
@@ -199,7 +199,7 @@
</binFileExtensions>
<programs>
<program>
- <mainClass>de.ids_mannheim.korap.tokenizer.KorAPTokenizer</mainClass>
+ <mainClass>de.ids_mannheim.korap.tokenizer.Main</mainClass>
<id>koraptokenizer</id>
</program>
</programs>
@@ -244,5 +244,11 @@
<artifactId>appassembler-maven-plugin</artifactId>
<version>2.1.0</version>
</dependency>
+ <dependency>
+ <groupId>org.jetbrains</groupId>
+ <artifactId>annotations</artifactId>
+ <version>RELEASE</version>
+ <scope>compile</scope>
+ </dependency>
</dependencies>
</project>
diff --git a/src/main/java/de/ids_mannheim/korap/tokenizer/KorapTokenizer.java b/src/main/java/de/ids_mannheim/korap/tokenizer/KorapTokenizer.java
new file mode 100644
index 0000000..cb8f280
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/tokenizer/KorapTokenizer.java
@@ -0,0 +1,103 @@
+package de.ids_mannheim.korap.tokenizer;
+
+import opennlp.tools.util.Span;
+
+import java.io.IOException;
+import java.io.PrintStream;
+import java.io.Reader;
+
+public interface KorapTokenizer extends opennlp.tools.tokenize.Tokenizer, opennlp.tools.sentdetect.SentenceDetector {
+ void scan() throws IOException;
+
+ /**
+ * Mainly targeted language(s)
+     * @return array of ISO 639 alpha-2 or alpha-3 language codes
+ * @apiNote will later be used to find appropriate implementations via reflection
+ */
+ CharSequence[] getTargetLanguages();
+
+ void setInputReader(Reader inputReader);
+
+ void setSplitSentences(boolean splitSentences);
+
+ void setEcho(boolean echo);
+
+ void setPrintOffsets(boolean printOffsets);
+
+ void setPrintTokens(boolean tokenize);
+
+ void setOutputStream(PrintStream outputStream);
+
+ void setNormalize(boolean normalize);
+
+ String[] tokenize(String s);
+
+ Span[] tokenizePos(String s);
+
+ String[] sentDetect(String s);
+
+ Span[] sentPosDetect(String s);
+
+ class Builder {
+ private boolean splitSentences;
+ private boolean echo;
+ private boolean printOffsets;
+ private boolean printTokens;
+ private PrintStream outputStream = System.out;
+ private boolean normalize;
+ private Class tokenizerClass;
+ private Reader inputReader;
+
+ public Builder tokenizerClassName(String tokenizerClassName) throws ClassNotFoundException {
+ this.tokenizerClass = Class.forName(tokenizerClassName);
+ return this;
+ }
+
+ public Builder splitSentences(boolean splitSentences) {
+ this.splitSentences = splitSentences;
+ return this;
+ }
+
+ public Builder setEcho(boolean echo) {
+ this.echo = echo;
+ return this;
+ }
+
+ public Builder printOffsets(boolean printOffsets) {
+ this.printOffsets = printOffsets;
+ return this;
+ }
+
+ public Builder printTokens(boolean printTokens) {
+ this.printTokens = printTokens;
+ return this;
+ }
+
+ public Builder inputReader(Reader inputReader) {
+ this.inputReader = inputReader;
+ return this;
+ }
+
+ public Builder normalize(boolean normalize) {
+ this.normalize = normalize;
+ return this;
+ }
+
+ public Builder outputStream(PrintStream outputStream) {
+ this.outputStream = outputStream;
+ return this;
+ }
+
+ public KorapTokenizer build() throws IllegalAccessException, InstantiationException {
+ KorapTokenizer korapTokenizer = (KorapTokenizer) tokenizerClass.newInstance();
+ korapTokenizer.setEcho(echo);
+ korapTokenizer.setInputReader(inputReader);
+ korapTokenizer.setOutputStream(outputStream);
+ korapTokenizer.setNormalize(normalize);
+ korapTokenizer.setPrintOffsets(printOffsets);
+ korapTokenizer.setSplitSentences(splitSentences);
+ korapTokenizer.setPrintTokens(printTokens);
+ return korapTokenizer;
+ }
+ }
+}
diff --git a/src/main/java/de/ids_mannheim/korap/tokenizer/KorAPTokenizer.java b/src/main/java/de/ids_mannheim/korap/tokenizer/Main.java
similarity index 78%
rename from src/main/java/de/ids_mannheim/korap/tokenizer/KorAPTokenizer.java
rename to src/main/java/de/ids_mannheim/korap/tokenizer/Main.java
index c754917..bf3c2ff 100644
--- a/src/main/java/de/ids_mannheim/korap/tokenizer/KorAPTokenizer.java
+++ b/src/main/java/de/ids_mannheim/korap/tokenizer/Main.java
@@ -8,7 +8,10 @@
@CommandLine.Command(mixinStandardHelpOptions = true,
name = "koraptokenizer", version = "{}", description = "Tokenizes (and sentence splits) text input.")
-public class KorAPTokenizer implements Callable<Integer> {
+public class Main implements Callable<Integer> {
+
+ @CommandLine.Option(names = {"-T", "--tokenizer-class"}, description = "Class name of the actual tokenizer that will be used (default: ${DEFAULT-VALUE})")
+ String tokenizerClassName = DerekoDfaTokenizer.class.getName();
@CommandLine.Option(names = {"--no-tokens"}, negatable = true, description = "Print tokens (default: ${DEFAULT-VALUE})")
boolean tokens = true;
@@ -38,12 +41,12 @@
@CommandLine.Parameters(arity = "0..*", paramLabel = "FILES", description = "input files")
private final ArrayList<String> inputFiles = new ArrayList<>();
- public KorAPTokenizer() {
+ public Main() {
}
public static void main(String[] args) {
- new CommandLine(new KorAPTokenizer()).execute(args);
+ new CommandLine(new Main()).execute(args);
}
@Override
@@ -57,13 +60,21 @@
}
for (int i = 0; i < inputFiles.size() || (i == 0 && inputFiles.size() == 0); i++) {
- KorAPDFATokenizer scanner = null;
String fn = (inputFiles.size() > 0 ? inputFiles.get(i) : "-");
try {
BufferedReader br = "-".equals(fn) ? new BufferedReader(new InputStreamReader(System.in)) :
new BufferedReader(new FileReader(fn));
- scanner = new KorAPDFATokenizer(br, output_stream, true, tokens, sentencize, positions, ktt, normalize);
- scanner.scanThrough();
+ new KorapTokenizer.Builder()
+ .tokenizerClassName(tokenizerClassName)
+ .inputReader(br)
+ .outputStream(output_stream)
+ .printTokens(tokens)
+ .printOffsets(positions)
+ .normalize(normalize)
+ .splitSentences(sentencize)
+ .setEcho(true)
+ .build()
+ .scan();
} catch (FileNotFoundException e) {
System.err.println("File not found : \"" + fn + "\"");
} catch (IOException e) {
diff --git a/src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPDFATokenizer.jflex b/src/main/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
similarity index 97%
rename from src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPDFATokenizer.jflex
rename to src/main/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
index a79d44b..6c3db85 100644
--- a/src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPDFATokenizer.jflex
+++ b/src/main/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
@@ -61,15 +61,16 @@
* ... which is ...
* Based on Lucene's StandardTokenizerImpl, but heavily modified.
*/
-%class KorAPDFATokenizer
+%class DerekoDfaTokenizer
%unicode
%public
-%implements opennlp.tools.tokenize.Tokenizer, opennlp.tools.sentdetect.SentenceDetector
+%implements KorapTokenizer, opennlp.tools.tokenize.Tokenizer, opennlp.tools.sentdetect.SentenceDetector
%type Span
%function getNextToken
%char
%{
+ private static final CharSequence[] targetLanguages = {"de"};
private boolean xmlEcho = false;
private boolean normalize = false;
private boolean debug = false;
@@ -78,36 +79,65 @@
private long previousFileEndOffset = -1;
private int tokenId = 0;
private boolean atEOT = false;
- private boolean sentencize = false;
+ private boolean splitSentences = false;
private boolean echo = false;
- private boolean positions = false;
- private boolean tokens = false;
+ private boolean printOffsets = false;
+ private boolean printTokens = false;
private PrintStream outputStream = System.out;
- public KorAPDFATokenizer() {
+ @Override
+ public CharSequence[] getTargetLanguages() {
+ return targetLanguages;
+ }
+
+ public DerekoDfaTokenizer() {
this.zzReader = null;
}
- public KorAPDFATokenizer(java.io.Reader in, PrintStream outputStream, boolean echo, boolean tokens, boolean sentencize, boolean positions, boolean xmlEcho, boolean normalize) {
- this.zzReader = in;
- if (outputStream != null)
- this.outputStream = outputStream;
- this.tokens = tokens;
- this.sentencize = sentencize;
- this.positions = positions;
+ @Override
+ public void setInputReader(Reader inputReader) {
+ this.zzReader = inputReader;
+ }
+
+ @Override
+ public void setSplitSentences(boolean splitSentences) {
+ this.splitSentences = splitSentences;
+ }
+
+ @Override
+ public void setEcho(boolean echo) {
this.echo = echo;
- this.xmlEcho = xmlEcho;
+ }
+
+ @Override
+ public void setPrintOffsets(boolean printOffsets) {
+ this.printOffsets = printOffsets;
+ }
+
+ @Override
+ public void setPrintTokens(boolean printTokens) {
+ this.printTokens = printTokens;
+ }
+
+ @Override
+ public void setOutputStream(PrintStream outputStream) {
+ this.outputStream = outputStream;
+ }
+
+ @Override
+ public void setNormalize(boolean normalize) {
this.normalize = normalize;
}
- public void scanThrough() throws IOException {
+ @Override
+ public void scan() throws IOException {
List<Span> list = new ArrayList<Span>();
Span token;
while (!zzAtEOF) {
token = this.getNextToken();
if (atEOT) {
if (echo) {
- printTokenPositions(list, sentencize);
+ printTokenPositions(list, splitSentences);
list.clear();
}
atEOT = false;
@@ -118,6 +148,7 @@
}
}
+ @Override
public String[] tokenize(String s) {
Span[] spans;
int i;
@@ -131,7 +162,7 @@
return tokens;
}
- public void printTokenPositions(List<Span> spanList, boolean sentencize) {
+ void printTokenPositions(List<Span> spanList, boolean sentencize) {
int sentenceStart = -1;
StringBuilder tokenStringBuffer = new StringBuilder();
StringBuilder sentenceStringBuffer = new StringBuilder();
@@ -139,7 +170,7 @@
Span s = spanList.get(i);
if (sentenceStart == -1)
sentenceStart = s.getStart();
- if (positions) {
+ if (printOffsets) {
tokenStringBuffer.append(s.getStart())
.append(" ")
.append(s.getEnd());
@@ -160,6 +191,7 @@
outputStream.println(sentenceStringBuffer.toString());
}
+ @Override
public Span[] tokenizePos(String s) {
Span token;
int i = 0;
@@ -171,7 +203,7 @@
token = this.getNextToken();
if (atEOT) {
if (echo) {
- printTokenPositions(list, sentencize);
+ printTokenPositions(list, splitSentences);
list.clear();
}
atEOT = false;
@@ -187,6 +219,7 @@
return (list.toArray(new Span[list.size()]));
}
+ @Override
public String[] sentDetect(String s) {
Span[] spans;
int i;
@@ -200,6 +233,7 @@
return sentences;
}
+ @Override
public Span[] sentPosDetect(String s) {
final Span tokens[] = tokenizePos(s);
ArrayList<Span> sentences = new ArrayList<Span>();
@@ -247,7 +281,7 @@
to = (yychar - startOffset + yylength() - lengthDiff);
if (xmlEcho) {
outputStream.println("<span id=\"t_" + tokenId + "\" from=\"" + from + "\" to=\"" + to + "\"/>\n" + value);
- } else if (echo && tokens) {
+ } else if (echo && printTokens) {
outputStream.println(value);
}
startOffset += lengthDiff;
diff --git a/src/test/java/de/ids_mannheim/korap/tokenizer/IPCOffsetTests.java b/src/test/java/de/ids_mannheim/korap/tokenizer/IPCOffsetTests.java
index bf5743c..611549f 100644
--- a/src/test/java/de/ids_mannheim/korap/tokenizer/IPCOffsetTests.java
+++ b/src/test/java/de/ids_mannheim/korap/tokenizer/IPCOffsetTests.java
@@ -47,7 +47,7 @@
public void testMainWithOffsetsAndSentencesOnDifferentInputFiles() throws IOException {
File tempFile = File.createTempFile("tokenoutput", ".txt");
String[] args = {"--no-tokens", "--positions", "--sentence-boundaries", "--force", "-o", tempFile.getAbsolutePath(), input};
- KorAPTokenizer.main(args);
+ Main.main(args);
String actualResult = readFile(tempFile.getAbsolutePath());
String goldData = readFile(gold);
assertEquals(goldData, actualResult);
diff --git a/src/test/java/de/ids_mannheim/korap/tokenizer/SentenceSplitterTest.java b/src/test/java/de/ids_mannheim/korap/tokenizer/SentenceSplitterTest.java
index 59a181a..bed83cb 100644
--- a/src/test/java/de/ids_mannheim/korap/tokenizer/SentenceSplitterTest.java
+++ b/src/test/java/de/ids_mannheim/korap/tokenizer/SentenceSplitterTest.java
@@ -12,21 +12,21 @@
@Test
public void testSentSplitterSimple () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] sentences = tok.sentDetect("Der alte Mann.");
assertEquals(sentences.length, 1);
}
@Test
public void testSentSplitterAbbr () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] sentences = tok.sentDetect("Der Vorsitzende der Abk. hat gewählt.");
assertEquals(sentences.length, 1);
}
@Test
public void testSentSplitterHost1 () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] sentences = tok.sentDetect("Gefunden auf wikipedia.org.");
assertEquals(sentences.length, 1);
}
@@ -34,14 +34,14 @@
@Test
@Ignore
public void testSentSplitterHost2 () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] sentences = tok.sentDetect("Gefunden auf www.wikipedia.org");
assertEquals(sentences.length, 1);
}
@Test
public void testSentSplitterEmail1 () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] sentences = tok.sentDetect("Ich bin unter korap@ids-mannheim.de erreichbar.");
assertEquals(sentences.length, 1);
}
@@ -49,28 +49,28 @@
@Test
public void testSentSplitterWeb1 () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] sentences = tok.sentDetect("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum");
assertEquals(sentences.length, 1);
}
@Test
public void testSentSplitterServer () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] sentences = tok.sentDetect("Unser Server ist 10.0.10.51.");
assertEquals(sentences.length, 1);
}
@Test
public void testSentSplitterNum () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] sentences = tok.sentDetect("Zu 50.4% ist es sicher");
assertEquals(sentences.length, 1);
}
@Test
public void testSentSplitterDate () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] sentences = tok.sentDetect("Der Termin ist am 5.9.2018");
assertEquals(sentences.length, 1);
}
@@ -78,14 +78,14 @@
@Test
// Probably interpreted as HOST
public void testSentSplitterFileExtension1 () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] sentences = tok.sentDetect("Ich habe die readme.txt heruntergeladen");
assertEquals(sentences.length, 1);
}
@Test
public void testSentMultiMarker () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] sentences = tok.sentDetect("Ausschalten!!! Hast Du nicht gehört???");
assertEquals("Ausschalten!!!", sentences[0]);
assertEquals("Hast Du nicht gehört???", sentences[1]);
@@ -95,7 +95,7 @@
@Test
@Ignore
public void testSentSplitterQuote () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] sentences = tok.sentDetect("\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\"");
assertEquals("\"Ausschalten!!!\", sagte er.", sentences[0]);
assertEquals("\"Hast Du nicht gehört???\"", sentences[1]);
diff --git a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerCoverTest.java b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerCoverTest.java
index a2da7fb..fd60d6f 100644
--- a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerCoverTest.java
+++ b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerCoverTest.java
@@ -116,7 +116,7 @@
* with the gold standard and return the sum of
* levenshtein distances.
*/
- public int distanceToGoldStandard (KorAPDFATokenizer tok, String suite, String postings) {
+ public int distanceToGoldStandard (DerekoDfaTokenizer tok, String suite, String postings) {
// Load raw postings
EmpiristScanner esRaw = new EmpiristScanner(
@@ -159,7 +159,7 @@
public void testTokenizerCoverEmpiristCmc () {
// Create tokenizer object
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String test = "cmc_test_blog_comment";
int dist = distanceToGoldStandard(tok, "test_cmc", test);
@@ -191,7 +191,7 @@
public void testTokenizerCoverEmpiristWeb () {
// Create tokenizer object
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String test = "web_test_001";
int dist = distanceToGoldStandard(tok, "test_web", test);
diff --git a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
index bbe8080..697c65c 100644
--- a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
+++ b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
@@ -16,7 +16,7 @@
@Test
public void testTokenizerSimple () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Der alte Mann");
assertEquals(tokens[0], "Der");
assertEquals(tokens[1], "alte");
@@ -34,7 +34,7 @@
@Test
@Ignore
public void testTokenizerAbbr () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Der Vorsitzende der F.D.P. hat gewählt");
assertEquals(tokens[0], "Der");
assertEquals(tokens[1], "Vorsitzende");
@@ -47,7 +47,7 @@
@Test
public void testTokenizerHost1 () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Gefunden auf wikipedia.org");
assertEquals(tokens[0], "Gefunden");
assertEquals(tokens[1], "auf");
@@ -58,7 +58,7 @@
@Test
@Ignore
public void testTokenizerHost2 () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Gefunden auf www.wikipedia.org");
assertEquals(tokens[0], "Gefunden");
assertEquals(tokens[1], "auf");
@@ -68,7 +68,7 @@
@Test
public void testTokenizerDash () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Das war -- spitze");
assertEquals(tokens[0], "Das");
assertEquals(tokens[1], "war");
@@ -79,7 +79,7 @@
@Test
public void testTokenizerEmail1 () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Ich bin unter korap@ids-mannheim.de erreichbar.");
assertEquals(tokens[0], "Ich");
assertEquals(tokens[1], "bin");
@@ -92,7 +92,7 @@
@Test
public void testTokenizerEmail2 () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Oder unter korap[at]ids-mannheim[dot]de.");
assertEquals(tokens[0], "Oder");
assertEquals(tokens[1], "unter");
@@ -104,7 +104,7 @@
@Test
@Ignore
public void testTokenizerEmail3 () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Oder unter korap(at)ids-mannheim(dot)de.");
assertEquals(tokens[0], "Oder");
assertEquals(tokens[1], "unter");
@@ -115,7 +115,7 @@
@Test
public void testTokenizerDoNotAcceptQuotedEmailNames () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("\"John Doe\"@xx.com");
assertEquals("\"", tokens[0]);
assertEquals("John", tokens[1]);
@@ -129,7 +129,7 @@
@Test
public void testTokenizerTwitter () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Folgt @korap und #korap");
assertEquals(tokens[0], "Folgt");
assertEquals(tokens[1], "@korap");
@@ -140,7 +140,7 @@
@Test
public void testTokenizerWeb1 () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum");
assertEquals(tokens[0], "Unsere");
assertEquals(tokens[1], "Website");
@@ -152,7 +152,7 @@
@Test
@Ignore
public void testTokenizerWeb2 () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)");
assertEquals(tokens[0], "Wir");
assertEquals(tokens[1], "sind");
@@ -168,7 +168,7 @@
@Test
@Ignore
public void testTokenizerWeb3 () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.");
assertEquals(tokens[0], "Die");
assertEquals(tokens[1], "Adresse");
@@ -180,7 +180,7 @@
@Test
public void testTokenizerServer () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Unser Server ist 10.0.10.51.");
assertEquals(tokens[0], "Unser");
assertEquals(tokens[1], "Server");
@@ -192,7 +192,7 @@
@Test
public void testTokenizerNum () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Zu 50,4% ist es sicher");
assertEquals(tokens[0], "Zu");
assertEquals(tokens[1], "50,4");
@@ -205,7 +205,7 @@
@Test
public void testTokenizerDate () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Der Termin ist am 5.9.2018");
assertEquals(tokens[0], "Der");
assertEquals(tokens[1], "Termin");
@@ -226,7 +226,7 @@
@Test
@Ignore
public void testTokenizerDateRange () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Der Termin war vom 4.-5.9.2018");
assertEquals(tokens[0], "Der");
assertEquals(tokens[1], "Termin");
@@ -240,7 +240,7 @@
@Test
public void testTokenizerEmoji1 () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Das ist toll! ;)");
assertEquals(tokens[0], "Das");
assertEquals(tokens[1], "ist");
@@ -252,7 +252,7 @@
@Test
public void testTokenizerRef1 () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Kupietz und Schmidt (2018): Korpuslinguistik");
assertEquals(tokens[0], "Kupietz");
assertEquals(tokens[1], "und");
@@ -267,7 +267,7 @@
@Test
public void testTokenizerRef2 () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Kupietz und Schmidt [2018]: Korpuslinguistik");
assertEquals(tokens[0], "Kupietz");
assertEquals(tokens[1], "und");
@@ -282,7 +282,7 @@
@Test
public void testTokenizerOmission1 () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Er ist ein A****loch!");
assertEquals(tokens[0], "Er");
assertEquals(tokens[1], "ist");
@@ -294,7 +294,7 @@
@Test
public void testTokenizerOmission2 () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("F*ck!");
assertEquals(tokens[0], "F*ck");
assertEquals(tokens[1], "!");
@@ -303,7 +303,7 @@
@Test
public void testTokenizerOmission3 () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Dieses verf***** Kleid!");
assertEquals(tokens[0], "Dieses");
assertEquals(tokens[1], "verf*****");
@@ -315,7 +315,7 @@
@Test
// Probably interpreted as HOST
public void testTokenizerFileExtension1 () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Ich habe die readme.txt heruntergeladen");
assertEquals(tokens[0], "Ich");
assertEquals(tokens[1], "habe");
@@ -328,7 +328,7 @@
@Test
// Probably interpreted as HOST
public void testTokenizerFileExtension2 () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Nimm die README.TXT!");
assertEquals(tokens[0], "Nimm");
assertEquals(tokens[1], "die");
@@ -340,7 +340,7 @@
@Test
// Probably interpreted as HOST
public void testTokenizerFileExtension3 () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Zeig mir profile.jpeg");
assertEquals(tokens[0], "Zeig");
assertEquals(tokens[1], "mir");
@@ -350,7 +350,7 @@
@Test
public void testTokenizerFile1 () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Zeig mir c:\\Dokumente\\profile.docx");
assertEquals(tokens[0], "Zeig");
assertEquals(tokens[1], "mir");
@@ -360,7 +360,7 @@
@Test
public void testTokenizerFile2 () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Gehe zu /Dokumente/profile.docx");
assertEquals(tokens[0], "Gehe");
assertEquals(tokens[1], "zu");
@@ -371,7 +371,7 @@
@Test
@Ignore
public void testTokenizerFile3 () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Zeig mir c:\\Dokumente\\profile.jpeg");
assertEquals(tokens[0], "Zeig");
assertEquals(tokens[1], "mir");
@@ -381,7 +381,7 @@
@Test
public void testTokenizerPunct () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Er sagte: \"Es geht mir gut!\", daraufhin ging er.");
assertEquals(tokens[0], "Er");
assertEquals(tokens[1], "sagte");
@@ -403,7 +403,7 @@
@Test
public void testTokenizerPlusAmpersand () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize(""Das ist von C&A!"");
assertEquals(tokens[0], """);
assertEquals(tokens[1], "Das");
@@ -417,7 +417,7 @@
@Test
public void testTokenizerLongEnd () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Siehst Du?!!?");
assertEquals(tokens[0], "Siehst");
assertEquals(tokens[1], "Du");
@@ -427,7 +427,7 @@
@Test
public void testTokenizerIrishO () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Peter O'Toole");
assertEquals(tokens[0], "Peter");
assertEquals(tokens[1], "O'Toole");
@@ -436,7 +436,7 @@
@Test
public void testTokenizerAbr () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Früher bzw. später ...");
assertEquals(tokens[0], "Früher");
assertEquals(tokens[1], "bzw.");
@@ -448,7 +448,7 @@
@Test
@Ignore
public void testTokenizerUppercaseRule () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Es war spät.Morgen ist es früh.");
assertEquals(tokens[0], "Es");
assertEquals(tokens[1], "war");
@@ -464,7 +464,7 @@
@Test
public void testTokenizerOrd () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Sie erreichte den 1. Platz!");
assertEquals(tokens[0], "Sie");
assertEquals(tokens[1], "erreichte");
@@ -477,7 +477,7 @@
@Test
public void testNoZipOuputArchive () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
String[] tokens = tok.tokenize("Archive: Ich bin kein zip\n");
assertEquals(tokens[0], "Archive");
assertEquals(tokens[1], ":");
@@ -490,7 +490,7 @@
@Test
public void testZipOuputArchive () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer();
+ DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
System.setOut(new PrintStream(clearOut));
String[] tokens = tok.tokenize("Archive: ich/bin/ein.zip\n");
@@ -498,8 +498,11 @@
}
@Test
- public void testTextBreakOutputArchive () {
- KorAPDFATokenizer tok = new KorAPDFATokenizer(null, null, false, false, false, true, false, false);
+ public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
+ DerekoDfaTokenizer tok = (DerekoDfaTokenizer) new KorapTokenizer.Builder()
+ .tokenizerClassName(DerekoDfaTokenizer.class.getName())
+ .printOffsets(true)
+ .build();
Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
assertEquals("Text1", tokens[0].getType());
assertEquals(tokens.length, 9 );
diff --git a/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/KorAPDFATokenizer.java b/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.java
similarity index 99%
rename from target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/KorAPDFATokenizer.java
rename to target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.java
index 7b45aa1..ab96a79 100644
--- a/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/KorAPDFATokenizer.java
+++ b/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.java
@@ -1,6 +1,6 @@
// DO NOT EDIT
// Generated by JFlex 1.8.2 http://jflex.de/
-// source: src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPDFATokenizer.jflex
+// source: src/main/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
package de.ids_mannheim.korap.tokenizer;
/**
@@ -61,7 +61,7 @@
// See https://github.com/jflex-de/jflex/issues/222
@SuppressWarnings("FallThrough")
-public class KorAPDFATokenizer implements opennlp.tools.tokenize.Tokenizer, opennlp.tools.sentdetect.SentenceDetector {
+public class DerekoDfaTokenizer implements KorapTokenizer, opennlp.tools.tokenize.Tokenizer, opennlp.tools.sentdetect.SentenceDetector {
/** This character denotes the end of file. */
public static final int YYEOF = -1;
@@ -34137,6 +34137,7 @@
private boolean zzEOFDone;
/* user code: */
+ private static final CharSequence[] targetLanguages = {"de"};
private boolean xmlEcho = false;
private boolean normalize = false;
private boolean debug = false;
@@ -34145,36 +34146,64 @@
private long previousFileEndOffset = -1;
private int tokenId = 0;
private boolean atEOT = false;
- private boolean sentencize = false;
+ private boolean splitSentences = false;
private boolean echo = false;
- private boolean positions = false;
- private boolean tokens = false;
+ private boolean printOffsets = false;
+ private boolean printTokens = false;
private PrintStream outputStream = System.out;
- public KorAPDFATokenizer() {
+ public CharSequence[] getTargetLanguages() {
+ return targetLanguages;
+ }
+
+ public DerekoDfaTokenizer() {
this.zzReader = null;
}
- public KorAPDFATokenizer(java.io.Reader in, PrintStream outputStream, boolean echo, boolean tokens, boolean sentencize, boolean positions, boolean xmlEcho, boolean normalize) {
- this.zzReader = in;
- if (outputStream != null)
- this.outputStream = outputStream;
- this.tokens = tokens;
- this.sentencize = sentencize;
- this.positions = positions;
+ @Override
+ public void setInputReader(Reader inputReader) {
+ this.zzReader = inputReader;
+ }
+
+ @Override
+ public void setSplitSentences(boolean splitSentences) {
+ this.splitSentences = splitSentences;
+ }
+
+ @Override
+ public void setEcho(boolean echo) {
this.echo = echo;
- this.xmlEcho = xmlEcho;
+ }
+
+ @Override
+ public void setPrintOffsets(boolean printOffsets) {
+ this.printOffsets = printOffsets;
+ }
+
+ @Override
+ public void setPrintTokens(boolean printTokens) {
+ this.printTokens = printTokens;
+ }
+
+ @Override
+ public void setOutputStream(PrintStream outputStream) {
+ this.outputStream = outputStream;
+ }
+
+ @Override
+ public void setNormalize(boolean normalize) {
this.normalize = normalize;
}
- public void scanThrough() throws IOException {
+ @Override
+ public void scan() throws IOException {
List<Span> list = new ArrayList<Span>();
Span token;
while (!zzAtEOF) {
token = this.getNextToken();
if (atEOT) {
if (echo) {
- printTokenPositions(list, sentencize);
+ printTokenPositions(list, splitSentences);
list.clear();
}
atEOT = false;
@@ -34185,6 +34214,7 @@
}
}
+ @Override
public String[] tokenize(String s) {
Span[] spans;
int i;
@@ -34198,7 +34228,7 @@
return tokens;
}
- public void printTokenPositions(List<Span> spanList, boolean sentencize) {
+ void printTokenPositions(List<Span> spanList, boolean sentencize) {
int sentenceStart = -1;
StringBuilder tokenStringBuffer = new StringBuilder();
StringBuilder sentenceStringBuffer = new StringBuilder();
@@ -34206,7 +34236,7 @@
Span s = spanList.get(i);
if (sentenceStart == -1)
sentenceStart = s.getStart();
- if (positions) {
+ if (printOffsets) {
tokenStringBuffer.append(s.getStart())
.append(" ")
.append(s.getEnd());
@@ -34227,6 +34257,7 @@
outputStream.println(sentenceStringBuffer.toString());
}
+ @Override
public Span[] tokenizePos(String s) {
Span token;
int i = 0;
@@ -34238,7 +34269,7 @@
token = this.getNextToken();
if (atEOT) {
if (echo) {
- printTokenPositions(list, sentencize);
+ printTokenPositions(list, splitSentences);
list.clear();
}
atEOT = false;
@@ -34254,6 +34285,7 @@
return (list.toArray(new Span[list.size()]));
}
+ @Override
public String[] sentDetect(String s) {
Span[] spans;
int i;
@@ -34267,6 +34299,7 @@
return sentences;
}
+ @Override
public Span[] sentPosDetect(String s) {
final Span tokens[] = tokenizePos(s);
ArrayList<Span> sentences = new ArrayList<Span>();
@@ -34314,7 +34347,7 @@
to = (yychar - startOffset + yylength() - lengthDiff);
if (xmlEcho) {
outputStream.println("<span id=\"t_" + tokenId + "\" from=\"" + from + "\" to=\"" + to + "\"/>\n" + value);
- } else if (echo && tokens) {
+ } else if (echo && printTokens) {
outputStream.println(value);
}
startOffset += lengthDiff;
@@ -34368,7 +34401,7 @@
*
* @param in the java.io.Reader to read input from.
*/
- public KorAPDFATokenizer(java.io.Reader in) {
+ public DerekoDfaTokenizer(java.io.Reader in) {
this.zzReader = in;
}