Implement sentence splitter

Change-Id: I80969a8ac68193dd2a3dd82c1f606807193c39c8
diff --git a/Readme.md b/Readme.md
index 470d94b..f149435 100644
--- a/Readme.md
+++ b/Readme.md
@@ -1,5 +1,5 @@
 # KorAP Tokenizer
-Efficient, [OpenNLP tools](https://opennlp.apache.org) compatible DFA tokenizer with character offset output based on [JFlex](https://www.jflex.de/), suitable for German and other European languages.
+Efficient, [OpenNLP tools](https://opennlp.apache.org) compatible DFA tokenizer and sentence splitter with character offset output based on [JFlex](https://www.jflex.de/), suitable for German and other European languages.
 
 ## Description
 The KorAP tokenizer is used for the German Reference Corpus DeReKo. Being based on a finite state automaton, 
@@ -8,7 +8,8 @@
 so that this information can be used for applying standoff annotations.
  
 The main class `KorAPTokenizerImpl` implements the [`opennlp.tools.tokenize.Tokenizer`](https://opennlp.apache.org/docs/1.8.2/apidocs/opennlp-tools/opennlp/tools/tokenize/Tokenizer.html)
-interface and can thus be used as a drop-in replacement in OpenNLP applications.
+and [`opennlp.tools.sentdetect.SentenceDetector`](https://opennlp.apache.org/docs/1.8.2/apidocs/opennlp-tools/opennlp/tools/sentdetect/SentenceDetector.html)
+interfaces and can thus be used as a drop-in replacement in OpenNLP applications.
 
 The scanner is based on the Lucene scanner with modifications from [David Hall](https://github.com/dlwh).  
 
@@ -34,7 +35,7 @@
 
 In the default mode, the tokenizer prints all offsets of the first character of a token and the first character after a token.
 In order to end a text, flush the output and reset the character position, the magic escape sequence `\n\x03\n` .
-## Invocation Example
+#### Invocation Example
 ```
 $ echo -n -e 'This is a text.\x0a\x03\x0aAnd this is another text.\n\x03\n' |\
    java -jar target/KorAP-Tokenizer-1.2-SNAPSHOT.jar
@@ -42,6 +43,16 @@
 0 4 5 7 8 9 10 15 
 0 3 4 8 9 11 12 19 20 25 
 ```
+#### With sentence splitting
+```
+echo -n -e ' This ist a start of a text. And this is a sentence!!! But what the hack????\x0a\x03\x0aAnd this is another text.\n\x03\nAnd this a sentence without marker\n' |\
+   java -jar target/KorAP-Tokenizer-1.2-SNAPSHOT.jar -s
+1 5 6 9 10 11 12 17 18 20 21 22 23 27 27 28 29 32 33 37 38 40 41 42 43 51 51 54 55 58 59 63 64 67 68 72 72 76 
+1 28 29 54 55 76
+0 3 4 8 9 11 12 19 20 24 24 25 
+0 25
+```
+
 ## Development and License
 
 **Authors**: 
diff --git a/src/assembly/bin-distribution.xml b/src/assembly/bin-distribution.xml
index 5b4aca9..4f910ee 100644
--- a/src/assembly/bin-distribution.xml
+++ b/src/assembly/bin-distribution.xml
@@ -16,6 +16,7 @@
                 <includes>
                     <include>opennlp/tools/util/Span.class</include>
                     <include>opennlp/tools/tokenize/Tokenizer.class</include>
+                    <include>opennlp/tools/sentdetect/SentenceDetector.class</include>
                 </includes>
             </unpackOptions>
             <scope>runtime</scope>
diff --git a/src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.jflex b/src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.jflex
index ca51178..7aed433 100644
--- a/src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.jflex
+++ b/src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.jflex
@@ -67,7 +67,7 @@
 %class KorAPTokenizerImpl
 %unicode
 %public
-%implements opennlp.tools.tokenize.Tokenizer
+%implements opennlp.tools.tokenize.Tokenizer, opennlp.tools.sentdetect.SentenceDetector
 %type Span
 %function getNextToken
 %char
@@ -75,21 +75,26 @@
 %{
 
 	public boolean xmlEcho = false;
+    public boolean sentences = false; // set by the -s option in main(): also emit sentence boundary offsets
 	public boolean normalize = false;
 	public boolean debug = false;
+	private boolean newSentence = true; // true while the next token should open a new sentence
 	private long startOffset = 0;
 	private int tokenId = 0;
 	private StringBuffer bounds = null;
-	
+	private long fallbackSentenceEndOffset = -1; // end offset of the last token; used when a text ends without a sentence marker
+	private StringBuffer sentenceBounds = null; // accumulated "start end" sentence offsets, printed and cleared in fileEnd()
+
   public KorAPTokenizerImpl() {
     this.zzReader = null;
+    sentenceBounds = new StringBuffer(); // non-null default avoids NPE when 'sentences' is enabled programmatically
   }
 
 	public String[] tokenize(String s) {
 		Span[] spans;
 		int i;
 		String[] tokens;
-		
+
 		spans = tokenizePos(s);
 		tokens = new String[spans.length];
 		for(i=0; i<spans.length; i++) {
@@ -97,7 +102,7 @@
 		}
 		return tokens;
 	}
-	
+
 	public Span[] tokenizePos(String s) {
 		Span token;
 		int i=0;
@@ -110,7 +115,7 @@
 				if(token != null) {
 					list.add(token);
 				}
-			} 
+			}
 		} catch (java.io.IOException e) {
 			System.out.println("IO error scanning "+s);
 			System.out.println(e);
@@ -118,11 +123,41 @@
 		return(list.toArray(new Span[list.size()]));
 	}
 
+	public String[] sentDetect(String s) { // opennlp SentenceDetector: return the sentences of s as strings
+     Span[] spans;
+     int i;
+     String[] sentences;
+
+     spans = sentPosDetect(s);
+     sentences = new String[spans.length];
+     for (i = 0; i < spans.length; i++) {
+         sentences[i] = spans[i].getType(); // sentPosDetect stores the covered text in the Span type field
+     }
+     return sentences;
+  }
+
+  public Span[] sentPosDetect(String s) {
+    final Span tokens[] = tokenizePos(s);
+    ArrayList<Span> sentences = new ArrayList<Span>();
+    int sentenceStart = 0;
+    if (tokens.length > 0)
+        sentenceStart = tokens[0].getStart(); // start at the first token, not at offset 0 (skips leading whitespace)
+    for (int i = 0; i < tokens.length; i++) {
+        if (isSentenceBound(tokens[i].getType()) || i == tokens.length - 1) { // boundary token, or last-token fallback
+            sentences.add(new Span(sentenceStart, tokens[i].getEnd(), s.substring(sentenceStart, tokens[i].getEnd())));
+            if (i < tokens.length - 1) {
+                sentenceStart = tokens[i + 1].getStart();
+            }
+        }
+    }
+    return sentences.toArray(new Span[0]);
+  }
+
 	public int[] tokenizeMilestones(String s) {
 		Span[] spans;
 		int i;
 		int[] milestones;
-		
+
 		spans = tokenizePos(s);
 		milestones = new int[2*spans.length];
 		for(i=0; i<spans.length; i++) {
@@ -135,11 +170,15 @@
 	public final long yychar()	{
     return yychar;
 	}
-	
+
 	final Span  currentToken() {
     return currentToken(yytext());
 	}
-	
+
+	public boolean isSentenceBound(String s) {
+        return s.matches("^[.?!]+$");
+    }
+
 	final Span currentToken(String normalizedValue) {
 		String value;
 		long lengthDiff=0;
@@ -165,16 +204,37 @@
 				System.err.println(from+"-"+to+":"+ value);
 			}
 			bounds.append(from+" "+to+" ");
-		}
-		return new Span((int)from, (int)to, value);
+            if (sentences) {
+                if (newSentence) {
+                    if (sentenceBounds.length() != 0)
+                        sentenceBounds.append(" ");
+                    sentenceBounds.append(from); // record this token's start as the sentence start offset
+                    newSentence = false;
+                }
+                if (isSentenceBound(value)) {
+                    sentenceBounds.append(" " + to);
+                    fallbackSentenceEndOffset = -1; // sentence closed by a marker; no fallback end needed
+                    newSentence = true;
+                } else {
+                    fallbackSentenceEndOffset = to; // remember last token end in case the text ends without a marker
+                }
+            }
+        }
+        return new Span((int)from, (int)to, value);
 	}
-	
+
 	final void fileEnd() {
 		startOffset = yychar+yylength();
 		tokenId=0;
 		if(bounds != null && !xmlEcho) {
 			System.out.println(bounds.toString());
+            if (sentences && sentenceBounds != null) { // sentence offsets are printed only in -s mode
+                if (fallbackSentenceEndOffset != -1) { sentenceBounds.append(" "+fallbackSentenceEndOffset); fallbackSentenceEndOffset = -1; }
+                System.out.println(sentenceBounds.toString());
+                sentenceBounds.setLength(0);
+            }
 			bounds.setLength(0);
+            newSentence = true;
 		}
 	}
 
@@ -191,7 +251,7 @@
 			return currentToken();
 		}
 	}
-	
+
 	final void zipArchive() {
 		String name;
 		String matched = yytext();
@@ -213,16 +273,19 @@
 		int j=0;
 		boolean xmlout = false;
 		boolean normalize = false;
+        boolean sentences = false;
 
 		for (int i = 0; i < argv.length && argv[i].indexOf("-") == 0; i++) {
 			if(argv[i].equals("-ktt")) { // act as a tokenizer for KorAP TreeTagger
-				xmlout=true; 
+				xmlout=true;
 			} else if(argv[i].equals("-n")) { // do some normailization
-				normalize=true; 
-			}
+				normalize=true;
+			}  else if(argv[i].equals("-s")) { // detect sentence boundaries
+        sentences=true;
+      }
 			j++;
 		}
-		
+
 		for (int i = j; i < argv.length || (i == j && argv.length == j); i++) {
 			KorAPTokenizerImpl scanner = null;
 			String fn = (argv.length > j ? argv[i] : "-");
@@ -231,8 +294,10 @@
 		        new BufferedReader(new java.io.FileReader(fn));
 				scanner = new KorAPTokenizerImpl(br);
 				scanner.bounds = new StringBuffer(1280000);
+        scanner.sentenceBounds = new StringBuffer(128000);
 				scanner.xmlEcho=xmlout;
 				scanner.normalize=normalize;
+				scanner.sentences=sentences;
 				while ( !scanner.zzAtEOF ) { scanner.getNextToken(); }
 			}
 			catch (java.io.FileNotFoundException e) {
@@ -248,8 +313,6 @@
 			}
 		}
   }
-
-
 %}
 
 THAI       = [\u0E00-\u0E59]
diff --git a/src/test/java/de/ids_mannheim/korap/tokenizer/SentenceSplitterTest.java b/src/test/java/de/ids_mannheim/korap/tokenizer/SentenceSplitterTest.java
new file mode 100644
index 0000000..cc5d4a5
--- /dev/null
+++ b/src/test/java/de/ids_mannheim/korap/tokenizer/SentenceSplitterTest.java
@@ -0,0 +1,104 @@
+package de.ids_mannheim.korap.tokenizer;
+
+import org.junit.Ignore;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import static org.junit.Assert.assertEquals;
+
+@RunWith(JUnit4.class)
+public class SentenceSplitterTest {
+
+    @Test
+    public void testSentSplitterSimple () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] sentences = tok.sentDetect("Der alte Mann.");
+        assertEquals(1, sentences.length);
+    }
+
+    @Test
+    public void testSentSplitterAbbr () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] sentences = tok.sentDetect("Der Vorsitzende der Abk. hat gewählt.");
+        assertEquals(1, sentences.length);
+    }
+
+    @Test
+    public void testSentSplitterHost1 () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] sentences = tok.sentDetect("Gefunden auf wikipedia.org.");
+        assertEquals(1, sentences.length);
+    }
+
+    @Test
+    @Ignore
+    public void testSentSplitterHost2 () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] sentences = tok.sentDetect("Gefunden auf www.wikipedia.org");
+        assertEquals(1, sentences.length);
+    }
+
+    @Test
+    public void testSentSplitterEmail1 () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] sentences = tok.sentDetect("Ich bin unter korap@ids-mannheim.de erreichbar.");
+        assertEquals(1, sentences.length);
+    }
+
+
+    @Test
+    public void testSentSplitterWeb1 () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] sentences = tok.sentDetect("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum");
+        assertEquals(1, sentences.length);
+    }
+
+    @Test
+    public void testSentSplitterServer () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] sentences = tok.sentDetect("Unser Server ist 10.0.10.51.");
+        assertEquals(1, sentences.length);
+    }
+
+    @Test
+    public void testSentSplitterNum () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] sentences = tok.sentDetect("Zu 50.4% ist es sicher");
+        assertEquals(1, sentences.length);
+    }
+
+    @Test
+    public void testSentSplitterDate () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] sentences = tok.sentDetect("Der Termin ist am 5.9.2018");
+        assertEquals(1, sentences.length);
+    }
+
+    @Test
+    // Probably interpreted as HOST
+    public void testSentSplitterFileExtension1 () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] sentences = tok.sentDetect("Ich habe die readme.txt heruntergeladen");
+        assertEquals(1, sentences.length);
+    }
+
+    @Test
+    public void testSentMultiMarker () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] sentences = tok.sentDetect("Ausschalten!!! Hast Du nicht gehört???");
+        assertEquals("Ausschalten!!!", sentences[0]);
+        assertEquals("Hast Du nicht gehört???", sentences[1]);
+        assertEquals(2, sentences.length);
+    }
+
+    @Test
+    @Ignore
+    public void testSentSplitterQuote () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] sentences = tok.sentDetect("\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\"");
+        assertEquals("\"Ausschalten!!!\", sagte er.", sentences[0]);
+        assertEquals("\"Hast Du nicht gehört???\"", sentences[1]);
+        assertEquals(2, sentences.length);
+    }
+}
diff --git a/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.java b/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.java
index 5711417..0474ef8 100644
--- a/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.java
+++ b/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.java
@@ -58,13 +58,14 @@
 import java.io.StringReader;
 import java.io.InputStreamReader;
 import java.lang.StringBuffer;
+import java.lang.reflect.Array;
 import java.util.ArrayList;
 import java.util.List;
 import opennlp.tools.util.Span;
 
 // See https://github.com/jflex-de/jflex/issues/222
 @SuppressWarnings("FallThrough")
-public class KorAPTokenizerImpl implements opennlp.tools.tokenize.Tokenizer {
+public class KorAPTokenizerImpl implements opennlp.tools.tokenize.Tokenizer, opennlp.tools.sentdetect.SentenceDetector {
 
   /** This character denotes the end of file. */
   public static final int YYEOF = -1;
@@ -35559,14 +35560,19 @@
   /* user code: */
 
 	public boolean xmlEcho = false;
+    public boolean sentences = false;
 	public boolean normalize = false;
 	public boolean debug = false;
-	private long startOffset = 0;
+	private boolean newSentence = true;
+    private long fallbackSentenceEndOffset = -1;
+    private long startOffset = 0;
 	private int tokenId = 0;
 	private StringBuffer bounds = null;
-	
+    private StringBuffer sentenceBounds = null;
+
   public KorAPTokenizerImpl() {
     this.zzReader = null;
+    sentenceBounds = new StringBuffer("");
   }
 
 	public String[] tokenize(String s) {
@@ -35623,6 +35629,10 @@
 	final Span  currentToken() {
     return currentToken(yytext());
 	}
+
+	public boolean isSentenceBound(String s) {
+        return s.matches("^[.?!]+$");
+    }
 	
 	final Span currentToken(String normalizedValue) {
 		String value;
@@ -35649,8 +35659,22 @@
 				System.err.println(from+"-"+to+":"+ value);
 			}
 			bounds.append(from+" "+to+" ");
-		}
-		return new Span((int)from, (int)to, value);
+            if (sentences) {
+                if (newSentence) {
+                    if (sentenceBounds.length() != 0)
+                        sentenceBounds.append(" ");
+                    sentenceBounds.append(from); // record this token's start as the sentence start offset
+                    newSentence = false;
+                }
+                if (isSentenceBound(value)) {
+                    sentenceBounds.append(" " + to); fallbackSentenceEndOffset = -1; // sentence closed by a marker; drop stale fallback (sync with jflex source)
+                    newSentence = true;
+                } else {
+                    fallbackSentenceEndOffset = to; // remember last token end in case the text ends without a marker
+                }
+            }
+        }
+        return new Span((int)from, (int)to, value);
 	}
 	
 	final void fileEnd() {
@@ -35658,7 +35682,13 @@
 		tokenId=0;
 		if(bounds != null && !xmlEcho) {
 			System.out.println(bounds.toString());
+            if (sentences && sentenceBounds != null) { // sentence offsets are printed only in -s mode (sync with jflex source)
+                if (fallbackSentenceEndOffset != -1) { sentenceBounds.append(" "+fallbackSentenceEndOffset); fallbackSentenceEndOffset = -1; }
+                System.out.println(sentenceBounds.toString());
+                sentenceBounds.setLength(0);
+            }
 			bounds.setLength(0);
+            newSentence = true;
 		}
 	}
 
@@ -35697,13 +35727,16 @@
 		int j=0;
 		boolean xmlout = false;
 		boolean normalize = false;
+        boolean sentences = false;
 
 		for (int i = 0; i < argv.length && argv[i].indexOf("-") == 0; i++) {
 			if(argv[i].equals("-ktt")) { // act as a tokenizer for KorAP TreeTagger
 				xmlout=true; 
 			} else if(argv[i].equals("-n")) { // do some normailization
 				normalize=true; 
-			}
+			}  else if(argv[i].equals("-s")) { // detect sentence boundaries
+                sentences=true;
+            }
 			j++;
 		}
 		
@@ -35715,8 +35748,10 @@
 		        new BufferedReader(new java.io.FileReader(fn));
 				scanner = new KorAPTokenizerImpl(br);
 				scanner.bounds = new StringBuffer(1280000);
+                scanner.sentenceBounds = new StringBuffer(128000);
 				scanner.xmlEcho=xmlout;
 				scanner.normalize=normalize;
+				scanner.sentences=sentences;
 				while ( !scanner.zzAtEOF ) { scanner.getNextToken(); }
 			}
 			catch (java.io.FileNotFoundException e) {
@@ -35733,9 +35768,6 @@
 		}
   }
 
-
-
-
   /**
    * Creates a new scanner
    *
@@ -36311,4 +36343,33 @@
   }
 
 
+    public String[] sentDetect(String s) {
+        Span[] spans;
+        int i;
+        String[] sentences;
+
+        spans = sentPosDetect(s);
+        sentences = new String[spans.length];
+        for (i = 0; i < spans.length; i++) {
+            sentences[i] = spans[i].getType();
+        }
+        return sentences;
+    }
+
+    public Span[] sentPosDetect(String s) {
+        final Span tokens[] = tokenizePos(s);
+        ArrayList<Span> sentences = new ArrayList<Span>();
+        int sentenceStart = 0;
+        if (tokens.length > 0)
+            sentenceStart = tokens[0].getStart(); // start at the first token, not at offset 0 (skips leading whitespace)
+        for (int i = 0; i < tokens.length; i++) {
+            if (isSentenceBound(tokens[i].getType()) || i == tokens.length - 1) { // boundary token, or last-token fallback
+                sentences.add(new Span(sentenceStart, tokens[i].getEnd(), s.substring(sentenceStart, tokens[i].getEnd())));
+                if (i < tokens.length - 1) {
+                    sentenceStart = tokens[i + 1].getStart();
+                }
+            }
+        }
+        return sentences.toArray(new Span[0]);
+    }
 }