Implement sentence splitter
Change-Id: I80969a8ac68193dd2a3dd82c1f606807193c39c8
diff --git a/Readme.md b/Readme.md
index 470d94b..f149435 100644
--- a/Readme.md
+++ b/Readme.md
@@ -1,5 +1,5 @@
# KorAP Tokenizer
-Efficient, [OpenNLP tools](https://opennlp.apache.org) compatible DFA tokenizer with character offset output based on [JFlex](https://www.jflex.de/), suitable for German and other European languages.
+Efficient, [OpenNLP tools](https://opennlp.apache.org) compatible DFA tokenizer and sentence splitter with character offset output based on [JFlex](https://www.jflex.de/), suitable for German and other European languages.
## Description
The KorAP tokenizer is used for the German Reference Corpus DeReKo. Being based on a finite state automaton,
@@ -8,7 +8,8 @@
so that this information can be used for applying standoff annotations.
The main class `KorAPTokenizerImpl` implements the [`opennlp.tools.tokenize.Tokenizer`](https://opennlp.apache.org/docs/1.8.2/apidocs/opennlp-tools/opennlp/tools/tokenize/Tokenizer.html)
-interface and can thus be used as a drop-in replacement in OpenNLP applications.
+and [`opennlp.tools.sentdetect.SentenceDetector`](https://opennlp.apache.org/docs/1.8.2/apidocs/opennlp-tools/opennlp/tools/sentdetect/SentenceDetector.html)
+interfaces and can thus be used as a drop-in replacement in OpenNLP applications.
The scanner is based on the Lucene scanner with modifications from [David Hall](https://github.com/dlwh).
@@ -34,7 +35,7 @@
In the default mode, the tokenizer prints all offsets of the first character of a token and the first character after a token.
In order to end a text, flush the output and reset the character position, the magic escape sequence `\n\x03\n` .
-## Invocation Example
+#### Invocation Example
```
$ echo -n -e 'This is a text.\x0a\x03\x0aAnd this is another text.\n\x03\n' |\
java -jar target/KorAP-Tokenizer-1.2-SNAPSHOT.jar
@@ -42,6 +43,16 @@
0 4 5 7 8 9 10 15
0 3 4 8 9 11 12 19 20 25
```
+#### With sentence splitting
+```
+echo -n -e ' This ist a start of a text. And this is a sentence!!! But what the hack????\x0a\x03\x0aAnd this is another text.\n\x03\nAnd this a sentence without marker\n' |\
+ java -jar target/KorAP-Tokenizer-1.2-SNAPSHOT.jar -s
+1 5 6 9 10 11 12 17 18 20 21 22 23 27 27 28 29 32 33 37 38 40 41 42 43 51 51 54 55 58 59 63 64 67 68 72 72 76
+1 28 29 54 55 76
+0 3 4 8 9 11 12 19 20 24 24 25
+0 25
+```
+
## Development and License
**Authors**:
diff --git a/src/assembly/bin-distribution.xml b/src/assembly/bin-distribution.xml
index 5b4aca9..4f910ee 100644
--- a/src/assembly/bin-distribution.xml
+++ b/src/assembly/bin-distribution.xml
@@ -16,6 +16,7 @@
<includes>
<include>opennlp/tools/util/Span.class</include>
<include>opennlp/tools/tokenize/Tokenizer.class</include>
+ <include>opennlp/tools/sentdetect/SentenceDetector.class</include>
</includes>
</unpackOptions>
<scope>runtime</scope>
diff --git a/src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.jflex b/src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.jflex
index ca51178..7aed433 100644
--- a/src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.jflex
+++ b/src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.jflex
@@ -67,7 +67,7 @@
%class KorAPTokenizerImpl
%unicode
%public
-%implements opennlp.tools.tokenize.Tokenizer
+%implements opennlp.tools.tokenize.Tokenizer, opennlp.tools.sentdetect.SentenceDetector
%type Span
%function getNextToken
%char
@@ -75,21 +75,26 @@
%{
public boolean xmlEcho = false;
+ public boolean sentences = false;
public boolean normalize = false;
public boolean debug = false;
+ private boolean newSentence = true;
private long startOffset = 0;
private int tokenId = 0;
private StringBuffer bounds = null;
-
+ private long fallbackSentenceEndOffset = -1;
+ private StringBuffer sentenceBounds = null;
+
public KorAPTokenizerImpl() {
this.zzReader = null;
+    sentenceBounds = new StringBuffer("");
}
public String[] tokenize(String s) {
Span[] spans;
int i;
String[] tokens;
-
+
spans = tokenizePos(s);
tokens = new String[spans.length];
for(i=0; i<spans.length; i++) {
@@ -97,7 +102,7 @@
}
return tokens;
}
-
+
public Span[] tokenizePos(String s) {
Span token;
int i=0;
@@ -110,7 +115,7 @@
if(token != null) {
list.add(token);
}
- }
+ }
} catch (java.io.IOException e) {
System.out.println("IO error scanning "+s);
System.out.println(e);
@@ -118,11 +123,41 @@
return(list.toArray(new Span[list.size()]));
}
+ public String[] sentDetect(String s) {
+ Span[] spans;
+ int i;
+ String[] sentences;
+
+ spans = sentPosDetect(s);
+ sentences = new String[spans.length];
+ for (i = 0; i < spans.length; i++) {
+ sentences[i] = spans[i].getType();
+ }
+ return sentences;
+ }
+
+ public Span[] sentPosDetect(String s) {
+ final Span tokens[] = tokenizePos(s);
+ ArrayList<Span> sentences = new ArrayList<Span>();
+ int sentenceStart = 0;
+      if (tokens.length > 0)
+          sentenceStart = tokens[0].getStart();
+ for (int i = 0; i < tokens.length; i++) {
+ if (tokens[i].getType().matches("^[.?!]+$") || i == tokens.length - 1) {
+ sentences.add(new Span(sentenceStart, tokens[i].getEnd(), s.substring(sentenceStart, tokens[i].getEnd())));
+ if (i < tokens.length - 1) {
+ sentenceStart = tokens[i + 1].getStart();
+ }
+ }
+ }
+ return sentences.toArray(new Span[0]);
+ }
+
public int[] tokenizeMilestones(String s) {
Span[] spans;
int i;
int[] milestones;
-
+
spans = tokenizePos(s);
milestones = new int[2*spans.length];
for(i=0; i<spans.length; i++) {
@@ -135,11 +170,15 @@
public final long yychar() {
return yychar;
}
-
+
final Span currentToken() {
return currentToken(yytext());
}
-
+
+ public boolean isSentenceBound(String s) {
+ return s.matches("^[.?!]+$");
+ }
+
final Span currentToken(String normalizedValue) {
String value;
long lengthDiff=0;
@@ -165,16 +204,37 @@
System.err.println(from+"-"+to+":"+ value);
}
bounds.append(from+" "+to+" ");
- }
- return new Span((int)from, (int)to, value);
+ if (sentences) {
+ if (newSentence) {
+ if (sentenceBounds.length() != 0)
+ sentenceBounds.append(" ");
+ sentenceBounds.append(from);
+ newSentence = false;
+ }
+ if (isSentenceBound(value)) {
+ sentenceBounds.append(" " + to);
+ fallbackSentenceEndOffset = -1;
+ newSentence = true;
+ } else {
+ fallbackSentenceEndOffset = to;
+ }
+ }
+ }
+ return new Span((int)from, (int)to, value);
}
-
+
final void fileEnd() {
startOffset = yychar+yylength();
tokenId=0;
if(bounds != null && !xmlEcho) {
System.out.println(bounds.toString());
+ if (sentences && sentenceBounds != null) {
+ if (fallbackSentenceEndOffset != -1)
+ sentenceBounds.append(" "+fallbackSentenceEndOffset);
+ System.out.println(sentenceBounds.toString());
+ }
bounds.setLength(0);
+ sentenceBounds.setLength(0);
}
}
@@ -191,7 +251,7 @@
return currentToken();
}
}
-
+
final void zipArchive() {
String name;
String matched = yytext();
@@ -213,16 +273,19 @@
int j=0;
boolean xmlout = false;
boolean normalize = false;
+ boolean sentences = false;
for (int i = 0; i < argv.length && argv[i].indexOf("-") == 0; i++) {
if(argv[i].equals("-ktt")) { // act as a tokenizer for KorAP TreeTagger
- xmlout=true;
+ xmlout=true;
} else if(argv[i].equals("-n")) { // do some normailization
- normalize=true;
- }
+ normalize=true;
+ } else if(argv[i].equals("-s")) { // detect sentence boundaries
+ sentences=true;
+ }
j++;
}
-
+
for (int i = j; i < argv.length || (i == j && argv.length == j); i++) {
KorAPTokenizerImpl scanner = null;
String fn = (argv.length > j ? argv[i] : "-");
@@ -231,8 +294,10 @@
new BufferedReader(new java.io.FileReader(fn));
scanner = new KorAPTokenizerImpl(br);
scanner.bounds = new StringBuffer(1280000);
+ scanner.sentenceBounds = new StringBuffer(128000);
scanner.xmlEcho=xmlout;
scanner.normalize=normalize;
+ scanner.sentences=sentences;
while ( !scanner.zzAtEOF ) { scanner.getNextToken(); }
}
catch (java.io.FileNotFoundException e) {
@@ -248,8 +313,6 @@
}
}
}
-
-
%}
THAI = [\u0E00-\u0E59]
diff --git a/src/test/java/de/ids_mannheim/korap/tokenizer/SentenceSplitterTest.java b/src/test/java/de/ids_mannheim/korap/tokenizer/SentenceSplitterTest.java
new file mode 100644
index 0000000..cc5d4a5
--- /dev/null
+++ b/src/test/java/de/ids_mannheim/korap/tokenizer/SentenceSplitterTest.java
@@ -0,0 +1,104 @@
+package de.ids_mannheim.korap.tokenizer;
+
+import org.junit.Ignore;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import static org.junit.Assert.assertEquals;
+
+@RunWith(JUnit4.class)
+public class SentenceSplitterTest {
+
+ @Test
+ public void testSentSplitterSimple () {
+ KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+ String[] sentences = tok.sentDetect("Der alte Mann.");
+ assertEquals(sentences.length, 1);
+ }
+
+ @Test
+ public void testSentSplitterAbbr () {
+ KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+ String[] sentences = tok.sentDetect("Der Vorsitzende der Abk. hat gewählt.");
+ assertEquals(sentences.length, 1);
+ }
+
+ @Test
+ public void testSentSplitterHost1 () {
+ KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+ String[] sentences = tok.sentDetect("Gefunden auf wikipedia.org.");
+ assertEquals(sentences.length, 1);
+ }
+
+ @Test
+ @Ignore
+ public void testSentSplitterHost2 () {
+ KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+ String[] sentences = tok.sentDetect("Gefunden auf www.wikipedia.org");
+ assertEquals(sentences.length, 1);
+ }
+
+ @Test
+ public void testSentSplitterEmail1 () {
+ KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+ String[] sentences = tok.sentDetect("Ich bin unter korap@ids-mannheim.de erreichbar.");
+ assertEquals(sentences.length, 1);
+ }
+
+
+ @Test
+ public void testSentSplitterWeb1 () {
+ KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+ String[] sentences = tok.sentDetect("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum");
+ assertEquals(sentences.length, 1);
+ }
+
+ @Test
+ public void testSentSplitterServer () {
+ KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+ String[] sentences = tok.sentDetect("Unser Server ist 10.0.10.51.");
+ assertEquals(sentences.length, 1);
+ }
+
+ @Test
+ public void testSentSplitterNum () {
+ KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+ String[] sentences = tok.sentDetect("Zu 50.4% ist es sicher");
+ assertEquals(sentences.length, 1);
+ }
+
+ @Test
+ public void testSentSplitterDate () {
+ KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+ String[] sentences = tok.sentDetect("Der Termin ist am 5.9.2018");
+ assertEquals(sentences.length, 1);
+ }
+
+ @Test
+ // Probably interpreted as HOST
+ public void testSentSplitterFileExtension1 () {
+ KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+ String[] sentences = tok.sentDetect("Ich habe die readme.txt heruntergeladen");
+ assertEquals(sentences.length, 1);
+ }
+
+ @Test
+ public void testSentMultiMarker () {
+ KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+ String[] sentences = tok.sentDetect("Ausschalten!!! Hast Du nicht gehört???");
+ assertEquals("Ausschalten!!!", sentences[0]);
+ assertEquals("Hast Du nicht gehört???", sentences[1]);
+ assertEquals(sentences.length, 2);
+ }
+
+ @Test
+ @Ignore
+ public void testSentSplitterQuote () {
+ KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+ String[] sentences = tok.sentDetect("\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\"");
+ assertEquals("\"Ausschalten!!!\", sagte er.", sentences[0]);
+ assertEquals("\"Hast Du nicht gehört???\"", sentences[1]);
+ assertEquals(sentences.length, 2);
+ }
+}
diff --git a/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.java b/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.java
index 5711417..0474ef8 100644
--- a/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.java
+++ b/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.java
@@ -58,13 +58,14 @@
import java.io.StringReader;
import java.io.InputStreamReader;
import java.lang.StringBuffer;
+import java.lang.reflect.Array;
import java.util.ArrayList;
import java.util.List;
import opennlp.tools.util.Span;
// See https://github.com/jflex-de/jflex/issues/222
@SuppressWarnings("FallThrough")
-public class KorAPTokenizerImpl implements opennlp.tools.tokenize.Tokenizer {
+public class KorAPTokenizerImpl implements opennlp.tools.tokenize.Tokenizer, opennlp.tools.sentdetect.SentenceDetector {
/** This character denotes the end of file. */
public static final int YYEOF = -1;
@@ -35559,14 +35560,19 @@
/* user code: */
public boolean xmlEcho = false;
+ public boolean sentences = false;
public boolean normalize = false;
public boolean debug = false;
- private long startOffset = 0;
+ private boolean newSentence = true;
+ private long fallbackSentenceEndOffset = -1;
+ private long startOffset = 0;
private int tokenId = 0;
private StringBuffer bounds = null;
-
+ private StringBuffer sentenceBounds = null;
+
public KorAPTokenizerImpl() {
this.zzReader = null;
+ sentenceBounds = new StringBuffer("");
}
public String[] tokenize(String s) {
@@ -35623,6 +35629,10 @@
final Span currentToken() {
return currentToken(yytext());
}
+
+ public boolean isSentenceBound(String s) {
+ return s.matches("^[.?!]+$");
+ }
final Span currentToken(String normalizedValue) {
String value;
@@ -35649,8 +35659,22 @@
System.err.println(from+"-"+to+":"+ value);
}
bounds.append(from+" "+to+" ");
- }
- return new Span((int)from, (int)to, value);
+ if (sentences) {
+ if (newSentence) {
+ if (sentenceBounds.length() != 0)
+ sentenceBounds.append(" ");
+ sentenceBounds.append(from);
+ newSentence = false;
+ }
+        if (isSentenceBound(value)) {
+          sentenceBounds.append(" " + to); fallbackSentenceEndOffset = -1;
+          newSentence = true;
+        } else {
+          fallbackSentenceEndOffset = to;
+        }
+ }
+ }
+ return new Span((int)from, (int)to, value);
}
final void fileEnd() {
@@ -35658,7 +35682,13 @@
tokenId=0;
if(bounds != null && !xmlEcho) {
System.out.println(bounds.toString());
+      if (sentences && sentenceBounds != null) {
+        if (fallbackSentenceEndOffset != -1)
+          sentenceBounds.append(" "+fallbackSentenceEndOffset);
+        System.out.println(sentenceBounds.toString());
+      }
bounds.setLength(0);
+ sentenceBounds.setLength(0);
}
}
@@ -35697,13 +35727,16 @@
int j=0;
boolean xmlout = false;
boolean normalize = false;
+ boolean sentences = false;
for (int i = 0; i < argv.length && argv[i].indexOf("-") == 0; i++) {
if(argv[i].equals("-ktt")) { // act as a tokenizer for KorAP TreeTagger
xmlout=true;
} else if(argv[i].equals("-n")) { // do some normailization
normalize=true;
- }
+ } else if(argv[i].equals("-s")) { // detect sentence boundaries
+ sentences=true;
+ }
j++;
}
@@ -35715,8 +35748,10 @@
new BufferedReader(new java.io.FileReader(fn));
scanner = new KorAPTokenizerImpl(br);
scanner.bounds = new StringBuffer(1280000);
+ scanner.sentenceBounds = new StringBuffer(128000);
scanner.xmlEcho=xmlout;
scanner.normalize=normalize;
+ scanner.sentences=sentences;
while ( !scanner.zzAtEOF ) { scanner.getNextToken(); }
}
catch (java.io.FileNotFoundException e) {
@@ -35733,9 +35768,6 @@
}
}
-
-
-
/**
* Creates a new scanner
*
@@ -36311,4 +36343,33 @@
}
+ public String[] sentDetect(String s) {
+ Span[] spans;
+ int i;
+ String[] sentences;
+
+ spans = sentPosDetect(s);
+ sentences = new String[spans.length];
+ for (i = 0; i < spans.length; i++) {
+ sentences[i] = spans[i].getType();
+ }
+ return sentences;
+ }
+
+ public Span[] sentPosDetect(String s) {
+ final Span tokens[] = tokenizePos(s);
+ ArrayList<Span> sentences = new ArrayList<Span>();
+ int sentenceStart = 0;
+      if (tokens.length > 0)
+          sentenceStart = tokens[0].getStart();
+ for (int i = 0; i < tokens.length; i++) {
+ if (tokens[i].getType().matches("^[.?!]+$") || i == tokens.length - 1) {
+ sentences.add(new Span(sentenceStart, tokens[i].getEnd(), s.substring(sentenceStart, tokens[i].getEnd())));
+ if (i < tokens.length - 1) {
+ sentenceStart = tokens[i + 1].getStart();
+ }
+ }
+ }
+ return sentences.toArray(new Span[0]);
+ }
}