Add new command line options using picocli and sanitize code
Usage: koraptokenizer [-hnpsV] [--force] [-ktt] [--[no-]tokens]
[-o=<output_filename>] [<inputFiles>...]
Tokenizes (and sentence splits) text input.
[<inputFiles>...] input files
--force Force overwrite (default: false)
-h, --help Show this help message and exit.
-ktt Deprecated. For internal use only. (default: false)
-n, --normalize Normalize tokens (default: false)
--[no-]tokens Print tokens (default: true)
-o, --output-file=<output_filename>
Output file (default: -)
-p, --positions Print token start and end positions as character
offsets (default: false)
-s, --sentence-boundaries
Print sentence boundary positions (default: false)
-V, --version Print version information and exit.
Change-Id: Ib92678c832a2d95799a8f503c3e86dd4da2b4d73
diff --git a/Readme.md b/Readme.md
index 84d6051..285866b 100644
--- a/Readme.md
+++ b/Readme.md
@@ -32,14 +32,14 @@
you will need at least 5 GB of free RAM.
## Documentation
-The KorAP tokenizer reads from standard input and writes to standard output. It currently supports two modes.
+The KorAP tokenizer reads from standard input and writes to standard output. It supports multiple modes of operation.
-In the default mode, the tokenizer prints all offsets of the first character of a token and the first character after a token.
+With the `--positions` option, for example, the tokenizer prints all offsets of the first character of a token and the first character after a token.
In order to end a text, flush the output and reset the character position, an EOT character (0x04) can be used.
#### Command Line Invocation
```
$ echo -n -e 'This is a text.\x0a\x03\x0aAnd this is another text.\n\x03\n' |\
- java -jar target/KorAP-Tokenizer-1.2-SNAPSHOT.jar
+ java -jar target/KorAP-Tokenizer-1.3-SNAPSHOT.jar --positions
0 4 5 7 8 9 10 15
0 3 4 8 9 11 12 19 20 25
@@ -47,7 +47,7 @@
#### Invocation with Sentence Splitting
```
echo -n -e ' This ist a start of a text. And this is a sentence!!! But what the hack????\x0a\x03\x0aAnd this is another text.\n\x03\nAnd this a sentence without marker\n' |\
- java -jar target/KorAP-Tokenizer-1.2-SNAPSHOT.jar -s
+ java -jar target/KorAP-Tokenizer-1.3-SNAPSHOT.jar --positions --sentence-boundaries
1 5 6 9 10 11 12 17 18 20 21 22 23 27 27 28 29 32 33 37 38 40 41 42 43 51 51 54 55 58 59 63 64 67 68 72 72 76
1 28 29 54 55 76
0 3 4 8 9 11 12 19 20 24 24 25
diff --git a/pom.xml b/pom.xml
index 4933cf9..9aca3ed 100644
--- a/pom.xml
+++ b/pom.xml
@@ -6,7 +6,7 @@
<groupId>groupId</groupId>
<artifactId>KorAP-Tokenizer</artifactId>
- <version>1.3-${git.commit.id.abbrev}</version>
+ <version>1.3-SNAPSHOT</version>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
@@ -117,7 +117,7 @@
<manifest>
<addClasspath>true</addClasspath>
<classpathPrefix>lib/</classpathPrefix>
- <mainClass>de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl</mainClass>
+ <mainClass>de.ids_mannheim.korap.tokenizer.KorAPTokenizer</mainClass>
</manifest>
</archive>
</configuration>
@@ -148,7 +148,7 @@
</descriptors>
<archive>
<manifest>
- <mainClass>de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl</mainClass>
+ <mainClass>de.ids_mannheim.korap.tokenizer.KorAPTokenizer</mainClass>
</manifest>
</archive>
</configuration>
@@ -180,10 +180,42 @@
<generateGitPropertiesFile>false</generateGitPropertiesFile><!-- somehow necessary. otherwise the variables are not available in the pom -->
</configuration>
</plugin>
+ <plugin>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>appassembler-maven-plugin</artifactId>
+ <version>1.10</version>
+ <executions>
+ <execution>
+ <phase>package</phase>
+ <goals>
+ <goal>assemble</goal>
+ </goals>
+ </execution>
+ </executions>
+ <configuration>
+ <binFolder>bin</binFolder>
+ <binFileExtensions>
+ <unix></unix>
+ </binFileExtensions>
+ <programs>
+ <program>
+ <mainClass>de.ids_mannheim.korap.tokenizer.KorAPTokenizer</mainClass>
+ <id>koraptokenizer</id>
+ </program>
+ </programs>
+ </configuration>
+ </plugin>
+
</plugins>
</build>
<dependencies>
+ <dependency>
+ <groupId>info.picocli</groupId>
+ <artifactId>picocli</artifactId>
+ <version>4.2.0</version>
+ </dependency>
+
<!-- https://mvnrepository.com/artifact/org.apache.opennlp/opennlp-tools -->
<dependency>
<groupId>org.apache.opennlp</groupId>
@@ -206,5 +238,11 @@
<version>1.0-1</version>
<scope>test</scope>
</dependency>
+ <!-- https://mvnrepository.com/artifact/org.codehaus.mojo/appassembler-maven-plugin -->
+ <dependency>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>appassembler-maven-plugin</artifactId>
+ <version>2.1.0</version>
+ </dependency>
</dependencies>
</project>
diff --git a/src/assembly/bin-distribution.xml b/src/assembly/bin-distribution.xml
index c830519..ffddc02 100644
--- a/src/assembly/bin-distribution.xml
+++ b/src/assembly/bin-distribution.xml
@@ -2,7 +2,7 @@
<assembly xmlns="http://maven.apache.org/ASSEMBLY/2.1.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.1.0 http://maven.apache.org/xsd/assembly-2.1.0.xsd">
- <id>standalone</id>
+ <id>${git.commit.id.abbrev}-standalone</id>
<formats>
<format>jar</format>
</formats>
@@ -17,6 +17,7 @@
<include>opennlp/tools/util/Span.class</include>
<include>opennlp/tools/tokenize/Tokenizer.class</include>
<include>opennlp/tools/sentdetect/SentenceDetector.class</include>
+ <include>picocli/CommandLine*.class</include>
</includes>
</unpackOptions>
<scope>runtime</scope>
diff --git a/src/main/java/de/ids_mannheim/korap/tokenizer/KorAPTokenizer.java b/src/main/java/de/ids_mannheim/korap/tokenizer/KorAPTokenizer.java
new file mode 100644
index 0000000..f5fa9cb
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/tokenizer/KorAPTokenizer.java
@@ -0,0 +1,83 @@
+package de.ids_mannheim.korap.tokenizer;
+
+import picocli.CommandLine;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.concurrent.Callable;
+
+@CommandLine.Command(mixinStandardHelpOptions = true,
+ name = "koraptokenizer", version = "{}", description = "Tokenizes (and sentence splits) text input.")
+public class KorAPTokenizer implements Callable<Integer> {
+
+ @CommandLine.Option(names = {"--no-tokens"}, negatable = true, description = "Print tokens (default: ${DEFAULT-VALUE})")
+ boolean tokens = true;
+
+ @CommandLine.Option(names = {"-p", "--positions"}, description = "Print token start and end positions as character offsets (default: ${DEFAULT-VALUE})")
+ boolean positions = false;
+
+ @CommandLine.Option(names = {"-s", "--sentence-boundaries"}, description = "Print sentence boundary positions (default: ${DEFAULT-VALUE})")
+ boolean sentencize = false;
+
+ @CommandLine.Option(names = {"-ktt"}, hidden = true, description = "Deprecated. For internal use only. (default: ${DEFAULT-VALUE})")
+ boolean ktt = false;
+
+ @CommandLine.Option(names = {"-n", "--normalize"}, description = "Normalize tokens (default: ${DEFAULT-VALUE})")
+ boolean normalize = false;
+
+ @SuppressWarnings("CanBeFinal")
+ @CommandLine.Option(names = {"-o",
+ "--output-file"}, paramLabel = "FILE", description = "Output file (default: ${DEFAULT-VALUE})")
+ String output_filename = "-";
+
+ @SuppressWarnings("CanBeFinal")
+ @CommandLine.Option(names = {"--force"}, description = "Force overwrite (default: ${DEFAULT-VALUE})")
+ boolean force_overwrite = false;
+
+
+ @CommandLine.Parameters(arity = "0..*", paramLabel = "FILES", description = "input files")
+ private final ArrayList<String> inputFiles = new ArrayList<>();
+
+ public KorAPTokenizer() {
+
+ }
+
+ public static void main(String[] args) {
+ new CommandLine(new KorAPTokenizer()).execute(args);
+ }
+
+ @Override
+ public Integer call() throws FileNotFoundException {
+ final PrintStream output_stream;
+ if ((output_filename == null) || output_filename.equals("-")) {
+ output_stream = System.out;
+ } else {
+ File f = Utils.createFile(output_filename, force_overwrite);
+ output_stream = new PrintStream(new BufferedOutputStream(new FileOutputStream(f)));
+ }
+
+ for (int i = 0; i < inputFiles.size() || (i == 0 && inputFiles.size() == 0); i++) {
+ KorAPTokenizerImpl scanner = null;
+ String fn = (inputFiles.size() > 0 ? inputFiles.get(i) : "-");
+ try {
+ BufferedReader br = "-".equals(fn) ? new BufferedReader(new InputStreamReader(System.in)) :
+ new BufferedReader(new FileReader(fn));
+ scanner = new KorAPTokenizerImpl(br, output_stream, true, tokens, sentencize, positions, ktt, normalize);
+ scanner.scanThrough();
+ } catch (FileNotFoundException e) {
+ System.err.println("File not found : \"" + fn + "\"");
+ } catch (IOException e) {
+ System.err.println("IO error scanning file \"" + fn + "\"");
+ System.err.println(e);
+ } catch (Exception e) {
+ System.err.println("Unexpected exception:");
+ e.printStackTrace();
+ }
+ }
+ if ((output_filename != null) && !output_filename.equals("-")) {
+ output_stream.close();
+ }
+ return 0;
+ }
+}
+
diff --git a/src/main/java/de/ids_mannheim/korap/tokenizer/Utils.java b/src/main/java/de/ids_mannheim/korap/tokenizer/Utils.java
new file mode 100644
index 0000000..6df7f00
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/tokenizer/Utils.java
@@ -0,0 +1,29 @@
+package de.ids_mannheim.korap.tokenizer;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.AccessDeniedException;
+import java.nio.file.FileAlreadyExistsException;
+import java.nio.file.Files;
+
+public class Utils {
+ public static File createFile(String fname, boolean force_overwrite) {
+ File f = new File(fname);
+ try {
+ Files.createFile(f.toPath());
+ } catch (AccessDeniedException e) {
+ final String message = "ERROR: Cannot write file '" + fname + "'";
+ System.err.println(message);
+ System.exit(-1);
+ } catch (FileAlreadyExistsException e) {
+ if (!force_overwrite) {
+ final String message = "ERROR: '" + fname + "' already exits. Use --force to overwrite";
+ System.err.println(message);
+ System.exit(-1);
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ return f;
+ }
+}
diff --git a/src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.jflex b/src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.jflex
index 79439c6..85bcadc 100644
--- a/src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.jflex
+++ b/src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.jflex
@@ -49,10 +49,7 @@
See the License for the specific language governing permissions and
limitations under the License.
*/
-
-import java.io.BufferedReader;
-import java.io.StringReader;
-import java.io.InputStreamReader;
+import java.io.*;
import java.lang.StringBuffer;
import java.util.ArrayList;
import java.util.List;
@@ -73,252 +70,230 @@
%char
%{
+ private boolean xmlEcho = false;
+ private boolean normalize = false;
+ private boolean debug = false;
+ private boolean newSentence = true;
+ private long startOffset = 0;
+ private long previousFileEndOffset = -1;
+ private int tokenId = 0;
+ private boolean atEOT = false;
+ private boolean sentencize = false;
+ private boolean echo = false;
+ private boolean positions = false;
+ private boolean tokens = false;
+ private PrintStream outputStream = System.out;
- public boolean xmlEcho = false;
- public boolean sentences = false;
- public boolean normalize = false;
- public boolean debug = false;
- private boolean newSentence = true;
- private long startOffset = 0;
- private long previousFileEndOffset = -1;
- private int tokenId = 0;
- private StringBuffer bounds = null;
- private long fallbackSentenceEndOffset = -1;
- private StringBuffer sentenceBounds = null;
+ public KorAPTokenizerImpl() {
+ this.zzReader = null;
+ }
- public KorAPTokenizerImpl() {
- this.zzReader = null;
- sentenceBounds = null;
- }
+ public KorAPTokenizerImpl(java.io.Reader in, PrintStream outputStream, boolean echo, boolean tokens, boolean sentencize, boolean positions, boolean xmlEcho, boolean normalize) {
+ this.zzReader = in;
+ if (outputStream != null)
+ this.outputStream = outputStream;
+ this.tokens = tokens;
+ this.sentencize = sentencize;
+ this.positions = positions;
+ this.echo = echo;
+ this.xmlEcho = xmlEcho;
+ this.normalize = normalize;
+ }
- public String[] tokenize(String s) {
- Span[] spans;
- int i;
- String[] tokens;
-
- spans = tokenizePos(s);
- tokens = new String[spans.length];
- for(i=0; i<spans.length; i++) {
- tokens[i]=spans[i].getType();
- }
- return tokens;
- }
-
- public Span[] tokenizePos(String s) {
- Span token;
- int i=0;
- List<Span> list = new ArrayList<Span>();
- tokenId=0;
- yyreset(new StringReader(s));
- try {
- while(!this.zzAtEOF) {
- token = this.getNextToken();
- if(token != null) {
- list.add(token);
- }
- }
- } catch (java.io.IOException e) {
- System.out.println("IO error scanning "+s);
- System.out.println(e);
- }
- return(list.toArray(new Span[list.size()]));
- }
-
- public String[] sentDetect(String s) {
- Span[] spans;
- int i;
- String[] sentences;
-
- spans = sentPosDetect(s);
- sentences = new String[spans.length];
- for (i = 0; i < spans.length; i++) {
- sentences[i] = spans[i].getType();
- }
- return sentences;
- }
-
- public Span[] sentPosDetect(String s) {
- final Span tokens[] = tokenizePos(s);
- ArrayList<Span> sentences = new ArrayList<Span>();
- int sentenceStart = 0;
- if (tokens.length > 0)
- tokens[0].getStart();
- for (int i = 0; i < tokens.length; i++) {
- if (tokens[i].getType().matches("^[.?!]+$") || i == tokens.length - 1) {
- sentences.add(new Span(sentenceStart, tokens[i].getEnd(), s.substring(sentenceStart, tokens[i].getEnd())));
- if (i < tokens.length - 1) {
- sentenceStart = tokens[i + 1].getStart();
+ public void scanThrough() throws IOException {
+ List<Span> list = new ArrayList<Span>();
+ Span token;
+ while (!zzAtEOF) {
+ token = this.getNextToken();
+ if (atEOT) {
+ if (echo) {
+ printTokenPositions(list, sentencize);
+ list.clear();
+ }
+ atEOT = false;
+ }
+ if (token != null) {
+ list.add(token);
}
}
}
- return sentences.toArray(new Span[0]);
- }
- public int[] tokenizeMilestones(String s) {
- Span[] spans;
- int i;
- int[] milestones;
+ public String[] tokenize(String s) {
+ Span[] spans;
+ int i;
+ String[] tokens;
- spans = tokenizePos(s);
- milestones = new int[2*spans.length];
- for(i=0; i<spans.length; i++) {
- milestones[i*2]=spans[i].getStart();
- milestones[i*2+1]=spans[i].getEnd();
- }
- return milestones;
- }
+ spans = tokenizePos(s);
+ tokens = new String[spans.length];
+ for (i = 0; i < spans.length; i++) {
+ tokens[i] = spans[i].getType();
+ }
+ return tokens;
+ }
- public final long yychar() {
- return yychar;
- }
+ public void printTokenPositions(List<Span> spanList, boolean sentencize) {
+ int sentenceStart = -1;
+ StringBuilder tokenStringBuffer = new StringBuilder();
+ StringBuilder sentenceStringBuffer = new StringBuilder();
+ for (int i = 0; i < spanList.size(); i++) {
+ Span s = spanList.get(i);
+ if (sentenceStart == -1)
+ sentenceStart = s.getStart();
+ if (positions) {
+ tokenStringBuffer.append(s.getStart())
+ .append(" ")
+ .append(s.getEnd());
+ if (i < spanList.size() - 1)
+ tokenStringBuffer.append(" ");
+ }
+ if (isSentenceBound(s.getType()) || (i == spanList.size() - 1)) {
+ sentenceStringBuffer.append(sentenceStart)
+ .append(" ")
+ .append(s.getEnd());
+ sentenceStart = -1;
+ if (i < spanList.size() - 1)
+ sentenceStringBuffer.append(" ");
+ }
+ }
+ outputStream.println(tokenStringBuffer.toString());
+ if (sentencize)
+ outputStream.println(sentenceStringBuffer.toString());
+ }
- final Span currentToken() {
- return currentToken(yytext());
- }
+ public Span[] tokenizePos(String s) {
+ Span token;
+ int i = 0;
+ List<Span> list = new ArrayList<Span>();
+ tokenId = 0;
+ yyreset(new StringReader(s));
+ try {
+ while (!this.zzAtEOF) {
+ token = this.getNextToken();
+ if (atEOT) {
+ if (echo) {
+ printTokenPositions(list, sentencize);
+ list.clear();
+ }
+ atEOT = false;
+ }
+ if (token != null) {
+ list.add(token);
+ }
+ }
+ } catch (java.io.IOException e) {
+ System.err.println("IO error scanning " + s);
+ System.err.println(e);
+ }
+ return (list.toArray(new Span[list.size()]));
+ }
- public boolean isSentenceBound(String s) {
+ public String[] sentDetect(String s) {
+ Span[] spans;
+ int i;
+ String[] sentences;
+
+ spans = sentPosDetect(s);
+ sentences = new String[spans.length];
+ for (i = 0; i < spans.length; i++) {
+ sentences[i] = spans[i].getType();
+ }
+ return sentences;
+ }
+
+ public Span[] sentPosDetect(String s) {
+ final Span tokens[] = tokenizePos(s);
+ ArrayList<Span> sentences = new ArrayList<Span>();
+ int sentenceStart = 0;
+ if (tokens.length > 0)
+ tokens[0].getStart();
+ for (int i = 0; i < tokens.length; i++) {
+ if (tokens[i].getType().matches("^[.?!]+$") || i == tokens.length - 1) {
+ sentences.add(new Span(sentenceStart, tokens[i].getEnd(), s.substring(sentenceStart, tokens[i].getEnd())));
+ if (i < tokens.length - 1) {
+ sentenceStart = tokens[i + 1].getStart();
+ }
+ }
+ }
+ return sentences.toArray(new Span[0]);
+ }
+
+ public final long yychar() {
+ return yychar;
+ }
+
+ final Span currentToken() {
+ return currentToken(yytext());
+ }
+
+ public boolean isSentenceBound(String s) {
return s.matches("^[.?!]+$");
}
- final Span currentToken(String normalizedValue) {
- String value;
- long lengthDiff=0;
- previousFileEndOffset = -1;
+ final Span currentToken(String normalizedValue) {
+ String value;
+ long lengthDiff = 0;
+ previousFileEndOffset = -1;
- if(normalize) {
- value = normalizedValue;
- } else {
- value = yytext();
- lengthDiff = value.length() - value.codePointCount(0, value.length());
- }
- if(startOffset > yychar || startOffset < 0) { // how can this happen?
- startOffset = 0;
- }
- long from = (yychar-startOffset),
- to = (yychar-startOffset+yylength()-lengthDiff);
- if(xmlEcho) {
- System.out.println("<span id=\"t_"+tokenId+"\" from=\""+from+"\" to=\"" + to + "\"/>\n"+value);
- }
- startOffset += lengthDiff;
- tokenId++;
- if(bounds != null) {
- if(debug) {
- System.err.println(from+"-"+to+":"+ value);
- }
- bounds.append(from+" "+to+" ");
- if (sentences) {
- if (newSentence || sentenceBounds.length() == 0) {
- if (sentenceBounds.length() != 0)
- sentenceBounds.append(" ");
- sentenceBounds.append(from);
- newSentence = false;
- }
- if (isSentenceBound(value)) {
- sentenceBounds.append(" " + to);
- fallbackSentenceEndOffset = -1;
- newSentence = true;
- } else {
- fallbackSentenceEndOffset = to;
- }
+ if (normalize) {
+ value = normalizedValue;
+ } else {
+ value = yytext();
+ lengthDiff = value.length() - value.codePointCount(0, value.length());
+ }
+ if (startOffset > yychar || startOffset < 0) { // how can this happen?
+ startOffset = 0;
+ }
+ long from = (yychar - startOffset),
+ to = (yychar - startOffset + yylength() - lengthDiff);
+ if (xmlEcho) {
+ outputStream.println("<span id=\"t_" + tokenId + "\" from=\"" + from + "\" to=\"" + to + "\"/>\n" + value);
+ } else if (echo && tokens) {
+ outputStream.println(value);
+ }
+ startOffset += lengthDiff;
+ tokenId++;
+ return new Span((int) from, (int) to, value);
+ }
+
+ final void fileEnd() {
+ startOffset = yychar + yylength();
+ // do not end a file multiple times because of additional EOT characters
+ if (startOffset == previousFileEndOffset)
+ return;
+ atEOT = true;
+ previousFileEndOffset = startOffset;
+ tokenId = 0;
+ }
+
+ final Span xmlPassage() {
+ if (xmlEcho) {
+ String dings = yytext();
+ if (dings.indexOf("<text") >= 0) {
+ startOffset = yychar + yylength();
+ tokenId = 0;
}
+ outputStream.println(dings.replaceAll("[\n\r]+", ""));
+ return null;
+ } else {
+ return currentToken();
}
- return new Span((int)from, (int)to, value);
- }
+ }
- final void fileEnd() {
- startOffset = yychar+yylength();
- // do not end a file multiple times because of additional EOT characters
- if (startOffset == previousFileEndOffset)
- return;;
- previousFileEndOffset = startOffset;
- tokenId=0;
- if(bounds != null && !xmlEcho) {
- System.out.println(bounds.toString().trim());
- if (sentences && sentenceBounds != null) {
- if (fallbackSentenceEndOffset != -1 && bounds.toString().trim().length() != 0)
- sentenceBounds.append(" "+fallbackSentenceEndOffset);
- System.out.println(sentenceBounds.toString());
- }
- bounds.setLength(0);
- sentenceBounds.setLength(0);
- }
- }
+ final void zipArchive() {
+ String name;
+ String matched = yytext();
+ int start = 10;
+ name = matched.substring(start, matched.length() - 1);
+ outputStream.println("<archive name=\"" + name + "\"/>");
+ }
- final Span xmlPassage() {
- if(xmlEcho) {
- String dings = yytext();
- if(dings.indexOf("<text")>=0 ) {
- startOffset = yychar+yylength();
- tokenId=0;
- }
- System.out.println(dings.replaceAll("[\n\r]+",""));
- return null;
- } else {
- return currentToken();
- }
- }
-
- final void zipArchive() {
- String name;
- String matched = yytext();
- int start = 10;
- name = matched.substring(start, matched.length() - 1);
- System.out.println("<archive name=\"" + name + "\"/>");
- }
-
- final void zippedFile() {
- String name;
- String matched = yytext();
- int start = 13;
- name = matched.substring(start, matched.length() - 3);
- System.out.println("<file name=\"" + name + "\"/>");
- }
-
- public static void main(String argv[]) {
- int args=argv.length;
- int j=0;
- boolean xmlout = false;
- boolean normalize = false;
- boolean sentences = false;
-
- for (int i = 0; i < argv.length && argv[i].indexOf("-") == 0; i++) {
- if(argv[i].equals("-ktt")) { // act as a tokenizer for KorAP TreeTagger
- xmlout=true;
- } else if(argv[i].equals("-n")) { // do some normailization
- normalize=true;
- } else if(argv[i].equals("-s")) { // detect sentence boundaries
- sentences=true;
- }
- j++;
- }
-
- for (int i = j; i < argv.length || (i == j && argv.length == j); i++) {
- KorAPTokenizerImpl scanner = null;
- String fn = (argv.length > j ? argv[i] : "-");
- try {
- BufferedReader br = "-".equals(fn) ? new BufferedReader(new InputStreamReader(System.in)) :
- new BufferedReader(new java.io.FileReader(fn));
- scanner = new KorAPTokenizerImpl(br);
- scanner.bounds = new StringBuffer(1280000);
- scanner.sentenceBounds = new StringBuffer(128000);
- scanner.xmlEcho=xmlout;
- scanner.normalize=normalize;
- scanner.sentences=sentences;
- while ( !scanner.zzAtEOF ) { scanner.getNextToken(); }
- }
- catch (java.io.FileNotFoundException e) {
- System.out.println("File not found : \""+fn+"\"");
- }
- catch (java.io.IOException e) {
- System.out.println("IO error scanning file \""+fn+"\"");
- System.out.println(e);
- }
- catch (Exception e) {
- System.out.println("Unexpected exception:");
- e.printStackTrace();
- }
- }
- }
+ final void zippedFile() {
+ String name;
+ String matched = yytext();
+ int start = 13;
+ name = matched.substring(start, matched.length() - 3);
+ outputStream.println("<file name=\"" + name + "\"/>");
+ }
%}
THAI = [\u0E00-\u0E59]
diff --git a/src/test/java/de/ids_mannheim/korap/tokenizer/IPCOffsetTests.java b/src/test/java/de/ids_mannheim/korap/tokenizer/IPCOffsetTests.java
index a4826dd..bf5743c 100644
--- a/src/test/java/de/ids_mannheim/korap/tokenizer/IPCOffsetTests.java
+++ b/src/test/java/de/ids_mannheim/korap/tokenizer/IPCOffsetTests.java
@@ -1,6 +1,5 @@
package de.ids_mannheim.korap.tokenizer;
-import org.apache.maven.surefire.shade.org.apache.commons.io.output.ByteArrayOutputStream;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
@@ -16,7 +15,6 @@
import static org.junit.Assert.*;
@RunWith(Parameterized.class)
-@net.jcip.annotations.NotThreadSafe
public class IPCOffsetTests {
@Parameterized.Parameters
public static Collection<Object[]> data() {
@@ -47,12 +45,12 @@
@Test
public void testMainWithOffsetsAndSentencesOnDifferentInputFiles() throws IOException {
- final ByteArrayOutputStream myOut = new ByteArrayOutputStream();
- System.setOut(new PrintStream(myOut));
- String[] args = {"-s", input};
- KorAPTokenizerImpl.main(args);
+ File tempFile = File.createTempFile("tokenoutput", ".txt");
+ String[] args = {"--no-tokens", "--positions", "--sentence-boundaries", "--force", "-o", tempFile.getAbsolutePath(), input};
+ KorAPTokenizer.main(args);
+ String actualResult = readFile(tempFile.getAbsolutePath());
String goldData = readFile(gold);
- assertEquals(goldData, myOut.toString(StandardCharsets.UTF_8));
+ assertEquals(goldData, actualResult);
}
}
diff --git a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
index f7811c0..fe694be 100644
--- a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
+++ b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
@@ -2,6 +2,7 @@
import static org.junit.Assert.*;
+import opennlp.tools.util.Span;
import org.apache.maven.surefire.shade.org.apache.commons.io.output.ByteArrayOutputStream;
import org.junit.Test;
import org.junit.Ignore;
@@ -495,4 +496,12 @@
String[] tokens = tok.tokenize("Archive: ich/bin/ein.zip\n");
assertEquals(0, tokens.length);
}
+
+ @Test
+ public void testTextBreakOutputArchive () {
+ KorAPTokenizerImpl tok = new KorAPTokenizerImpl(null, null, false, false, false, true, false, false);
+ Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
+ assertEquals("Text1", tokens[0].getType());
+ assertEquals(tokens.length, 9 );
+ }
}
diff --git a/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.java b/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.java
index 07a6e08..0c47090 100644
--- a/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.java
+++ b/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.java
@@ -53,10 +53,7 @@
See the License for the specific language governing permissions and
limitations under the License.
*/
-
-import java.io.BufferedReader;
-import java.io.StringReader;
-import java.io.InputStreamReader;
+import java.io.*;
import java.lang.StringBuffer;
import java.util.ArrayList;
import java.util.List;
@@ -34140,252 +34137,230 @@
private boolean zzEOFDone;
/* user code: */
+ private boolean xmlEcho = false;
+ private boolean normalize = false;
+ private boolean debug = false;
+ private boolean newSentence = true;
+ private long startOffset = 0;
+ private long previousFileEndOffset = -1;
+ private int tokenId = 0;
+ private boolean atEOT = false;
+ private boolean sentencize = false;
+ private boolean echo = false;
+ private boolean positions = false;
+ private boolean tokens = false;
+ private PrintStream outputStream = System.out;
- public boolean xmlEcho = false;
- public boolean sentences = false;
- public boolean normalize = false;
- public boolean debug = false;
- private boolean newSentence = true;
- private long startOffset = 0;
- private long previousFileEndOffset = -1;
- private int tokenId = 0;
- private StringBuffer bounds = null;
- private long fallbackSentenceEndOffset = -1;
- private StringBuffer sentenceBounds = null;
+ public KorAPTokenizerImpl() {
+ this.zzReader = null;
+ }
- public KorAPTokenizerImpl() {
- this.zzReader = null;
- sentenceBounds = null;
- }
+ public KorAPTokenizerImpl(java.io.Reader in, PrintStream outputStream, boolean echo, boolean tokens, boolean sentencize, boolean positions, boolean xmlEcho, boolean normalize) {
+ this.zzReader = in;
+ if (outputStream != null)
+ this.outputStream = outputStream;
+ this.tokens = tokens;
+ this.sentencize = sentencize;
+ this.positions = positions;
+ this.echo = echo;
+ this.xmlEcho = xmlEcho;
+ this.normalize = normalize;
+ }
- public String[] tokenize(String s) {
- Span[] spans;
- int i;
- String[] tokens;
-
- spans = tokenizePos(s);
- tokens = new String[spans.length];
- for(i=0; i<spans.length; i++) {
- tokens[i]=spans[i].getType();
- }
- return tokens;
- }
-
- public Span[] tokenizePos(String s) {
- Span token;
- int i=0;
- List<Span> list = new ArrayList<Span>();
- tokenId=0;
- yyreset(new StringReader(s));
- try {
- while(!this.zzAtEOF) {
- token = this.getNextToken();
- if(token != null) {
- list.add(token);
- }
- }
- } catch (java.io.IOException e) {
- System.out.println("IO error scanning "+s);
- System.out.println(e);
- }
- return(list.toArray(new Span[list.size()]));
- }
-
- public String[] sentDetect(String s) {
- Span[] spans;
- int i;
- String[] sentences;
-
- spans = sentPosDetect(s);
- sentences = new String[spans.length];
- for (i = 0; i < spans.length; i++) {
- sentences[i] = spans[i].getType();
- }
- return sentences;
- }
-
- public Span[] sentPosDetect(String s) {
- final Span tokens[] = tokenizePos(s);
- ArrayList<Span> sentences = new ArrayList<Span>();
- int sentenceStart = 0;
- if (tokens.length > 0)
- tokens[0].getStart();
- for (int i = 0; i < tokens.length; i++) {
- if (tokens[i].getType().matches("^[.?!]+$") || i == tokens.length - 1) {
- sentences.add(new Span(sentenceStart, tokens[i].getEnd(), s.substring(sentenceStart, tokens[i].getEnd())));
- if (i < tokens.length - 1) {
- sentenceStart = tokens[i + 1].getStart();
+ public void scanThrough() throws IOException {
+ List<Span> list = new ArrayList<Span>();
+ Span token;
+ while (!zzAtEOF) {
+ token = this.getNextToken();
+ if (atEOT) {
+ if (echo) {
+ printTokenPositions(list, sentencize);
+ list.clear();
+ }
+ atEOT = false;
+ }
+ if (token != null) {
+ list.add(token);
}
}
}
- return sentences.toArray(new Span[0]);
- }
- public int[] tokenizeMilestones(String s) {
- Span[] spans;
- int i;
- int[] milestones;
+ public String[] tokenize(String s) {
+ Span[] spans;
+ int i;
+ String[] tokens;
- spans = tokenizePos(s);
- milestones = new int[2*spans.length];
- for(i=0; i<spans.length; i++) {
- milestones[i*2]=spans[i].getStart();
- milestones[i*2+1]=spans[i].getEnd();
- }
- return milestones;
- }
+ spans = tokenizePos(s);
+ tokens = new String[spans.length];
+ for (i = 0; i < spans.length; i++) {
+ tokens[i] = spans[i].getType();
+ }
+ return tokens;
+ }
- public final long yychar() {
- return yychar;
- }
+ public void printTokenPositions(List<Span> spanList, boolean sentencize) {
+ int sentenceStart = -1;
+ StringBuilder tokenStringBuffer = new StringBuilder();
+ StringBuilder sentenceStringBuffer = new StringBuilder();
+ for (int i = 0; i < spanList.size(); i++) {
+ Span s = spanList.get(i);
+ if (sentenceStart == -1)
+ sentenceStart = s.getStart();
+ if (positions) {
+ tokenStringBuffer.append(s.getStart())
+ .append(" ")
+ .append(s.getEnd());
+ if (i < spanList.size() - 1)
+ tokenStringBuffer.append(" ");
+ }
+ if (isSentenceBound(s.getType()) || (i == spanList.size() - 1)) {
+ sentenceStringBuffer.append(sentenceStart)
+ .append(" ")
+ .append(s.getEnd());
+ sentenceStart = -1;
+ if (i < spanList.size() - 1)
+ sentenceStringBuffer.append(" ");
+ }
+ }
+ outputStream.println(tokenStringBuffer.toString());
+ if (sentencize)
+ outputStream.println(sentenceStringBuffer.toString());
+ }
- final Span currentToken() {
- return currentToken(yytext());
- }
+ public Span[] tokenizePos(String s) {
+ Span token;
+ int i = 0;
+ List<Span> list = new ArrayList<Span>();
+ tokenId = 0;
+ yyreset(new StringReader(s));
+ try {
+ while (!this.zzAtEOF) {
+ token = this.getNextToken();
+ if (atEOT) {
+ if (echo) {
+ printTokenPositions(list, sentencize);
+ list.clear();
+ }
+ atEOT = false;
+ }
+ if (token != null) {
+ list.add(token);
+ }
+ }
+ } catch (java.io.IOException e) {
+ System.err.println("IO error scanning " + s);
+ System.err.println(e);
+ }
+ return (list.toArray(new Span[list.size()]));
+ }
- public boolean isSentenceBound(String s) {
+ public String[] sentDetect(String s) {
+ Span[] spans;
+ int i;
+ String[] sentences;
+
+ spans = sentPosDetect(s);
+ sentences = new String[spans.length];
+ for (i = 0; i < spans.length; i++) {
+ sentences[i] = spans[i].getType();
+ }
+ return sentences;
+ }
+
+ // Splits s into sentences and returns their character-offset spans
+ // (the type field carries the sentence's surface string).
+ public Span[] sentPosDetect(String s) {
+ final Span tokens[] = tokenizePos(s);
+ ArrayList<Span> sentences = new ArrayList<Span>();
+ int sentenceStart = 0;
+ // start the first sentence at the first token, not at offset 0,
+ // so leading whitespace is not included
+ if (tokens.length > 0)
+ sentenceStart = tokens[0].getStart();
+ for (int i = 0; i < tokens.length; i++) {
+ // a token consisting only of .?! ends a sentence; the last token always does
+ if (tokens[i].getType().matches("^[.?!]+$") || i == tokens.length - 1) {
+ sentences.add(new Span(sentenceStart, tokens[i].getEnd(), s.substring(sentenceStart, tokens[i].getEnd())));
+ if (i < tokens.length - 1) {
+ sentenceStart = tokens[i + 1].getStart();
+ }
+ }
+ }
+ return sentences.toArray(new Span[0]);
+ }
+
+ // Exposes the scanner's current character position (JFlex yychar field).
+ public final long yychar() {
+ return yychar;
+ }
+
+ // Convenience overload: builds a span from the raw matched text.
+ final Span currentToken() {
+ return currentToken(yytext());
+ }
+
+ // True iff s consists only of sentence-final punctuation (., ?, !).
+ public boolean isSentenceBound(String s) {
return s.matches("^[.?!]+$");
}
- final Span currentToken(String normalizedValue) {
- String value;
- long lengthDiff=0;
- previousFileEndOffset = -1;
+ // Builds a Span for the current match and echoes it in XML or plain-token
+ // mode; offsets are text-relative (yychar minus startOffset).
+ final Span currentToken(String normalizedValue) {
+ String value;
+ long lengthDiff = 0;
+ previousFileEndOffset = -1;
- if(normalize) {
- value = normalizedValue;
- } else {
- value = yytext();
- lengthDiff = value.length() - value.codePointCount(0, value.length());
- }
- if(startOffset > yychar || startOffset < 0) { // how can this happen?
- startOffset = 0;
- }
- long from = (yychar-startOffset),
- to = (yychar-startOffset+yylength()-lengthDiff);
- if(xmlEcho) {
- System.out.println("<span id=\"t_"+tokenId+"\" from=\""+from+"\" to=\"" + to + "\"/>\n"+value);
- }
- startOffset += lengthDiff;
- tokenId++;
- if(bounds != null) {
- if(debug) {
- System.err.println(from+"-"+to+":"+ value);
- }
- bounds.append(from+" "+to+" ");
- if (sentences) {
- if (newSentence || sentenceBounds.length() == 0) {
- if (sentenceBounds.length() != 0)
- sentenceBounds.append(" ");
- sentenceBounds.append(from);
- newSentence = false;
- }
- if (isSentenceBound(value)) {
- sentenceBounds.append(" " + to);
- fallbackSentenceEndOffset = -1;
- newSentence = true;
- } else {
- fallbackSentenceEndOffset = to;
- }
+ if (normalize) {
+ value = normalizedValue;
+ } else {
+ value = yytext();
+ // lengthDiff counts surrogate pairs (non-BMP chars) so that the
+ // reported offsets are code-point based, not UTF-16 unit based
+ lengthDiff = value.length() - value.codePointCount(0, value.length());
+ }
+ if (startOffset > yychar || startOffset < 0) { // how can this happen?
+ startOffset = 0;
+ }
+ long from = (yychar - startOffset),
+ to = (yychar - startOffset + yylength() - lengthDiff);
+ if (xmlEcho) {
+ outputStream.println("<span id=\"t_" + tokenId + "\" from=\"" + from + "\" to=\"" + to + "\"/>\n" + value);
+ } else if (echo && tokens) {
+ outputStream.println(value);
+ }
+ startOffset += lengthDiff;
+ tokenId++;
+ return new Span((int) from, (int) to, value);
+ }
+
+ // Marks the end of a text (EOT): sets atEOT for the tokenize loop and
+ // resets the offset base and token ids for the next text.
+ final void fileEnd() {
+ startOffset = yychar + yylength();
+ // do not end a file multiple times because of additional EOT characters
+ if (startOffset == previousFileEndOffset)
+ return;
+ atEOT = true;
+ previousFileEndOffset = startOffset;
+ tokenId = 0;
+ }
+
+ // Handles a matched XML passage: in xmlEcho mode the tag is echoed
+ // (newlines stripped) and no token is produced; a "<text" tag additionally
+ // resets the offset base and token ids for a new text.
+ final Span xmlPassage() {
+ if (xmlEcho) {
+ String dings = yytext();
+ if (dings.indexOf("<text") >= 0) {
+ startOffset = yychar + yylength();
+ tokenId = 0;
}
+ outputStream.println(dings.replaceAll("[\n\r]+", ""));
+ return null;
+ } else {
+ // outside xmlEcho mode the passage is treated as a regular token
+ return currentToken();
}
- return new Span((int)from, (int)to, value);
- }
+ }
- final void fileEnd() {
- startOffset = yychar+yylength();
- // do not end a file multiple times because of additional EOT characters
- if (startOffset == previousFileEndOffset)
- return;;
- previousFileEndOffset = startOffset;
- tokenId=0;
- if(bounds != null && !xmlEcho) {
- System.out.println(bounds.toString().trim());
- if (sentences && sentenceBounds != null) {
- if (fallbackSentenceEndOffset != -1 && bounds.toString().trim().length() != 0)
- sentenceBounds.append(" "+fallbackSentenceEndOffset);
- System.out.println(sentenceBounds.toString());
- }
- bounds.setLength(0);
- sentenceBounds.setLength(0);
- }
- }
+ // Echoes an <archive name="..."/> tag for a matched zip archive header.
+ // NOTE(review): start=10 and the trailing -1 strip a fixed-length
+ // prefix/suffix of the match — confirm against the lexer rule.
+ final void zipArchive() {
+ String name;
+ String matched = yytext();
+ int start = 10;
+ name = matched.substring(start, matched.length() - 1);
+ outputStream.println("<archive name=\"" + name + "\"/>");
+ }
- final Span xmlPassage() {
- if(xmlEcho) {
- String dings = yytext();
- if(dings.indexOf("<text")>=0 ) {
- startOffset = yychar+yylength();
- tokenId=0;
- }
- System.out.println(dings.replaceAll("[\n\r]+",""));
- return null;
- } else {
- return currentToken();
- }
- }
-
- final void zipArchive() {
- String name;
- String matched = yytext();
- int start = 10;
- name = matched.substring(start, matched.length() - 1);
- System.out.println("<archive name=\"" + name + "\"/>");
- }
-
- final void zippedFile() {
- String name;
- String matched = yytext();
- int start = 13;
- name = matched.substring(start, matched.length() - 3);
- System.out.println("<file name=\"" + name + "\"/>");
- }
-
- public static void main(String argv[]) {
- int args=argv.length;
- int j=0;
- boolean xmlout = false;
- boolean normalize = false;
- boolean sentences = false;
-
- for (int i = 0; i < argv.length && argv[i].indexOf("-") == 0; i++) {
- if(argv[i].equals("-ktt")) { // act as a tokenizer for KorAP TreeTagger
- xmlout=true;
- } else if(argv[i].equals("-n")) { // do some normailization
- normalize=true;
- } else if(argv[i].equals("-s")) { // detect sentence boundaries
- sentences=true;
- }
- j++;
- }
-
- for (int i = j; i < argv.length || (i == j && argv.length == j); i++) {
- KorAPTokenizerImpl scanner = null;
- String fn = (argv.length > j ? argv[i] : "-");
- try {
- BufferedReader br = "-".equals(fn) ? new BufferedReader(new InputStreamReader(System.in)) :
- new BufferedReader(new java.io.FileReader(fn));
- scanner = new KorAPTokenizerImpl(br);
- scanner.bounds = new StringBuffer(1280000);
- scanner.sentenceBounds = new StringBuffer(128000);
- scanner.xmlEcho=xmlout;
- scanner.normalize=normalize;
- scanner.sentences=sentences;
- while ( !scanner.zzAtEOF ) { scanner.getNextToken(); }
- }
- catch (java.io.FileNotFoundException e) {
- System.out.println("File not found : \""+fn+"\"");
- }
- catch (java.io.IOException e) {
- System.out.println("IO error scanning file \""+fn+"\"");
- System.out.println(e);
- }
- catch (Exception e) {
- System.out.println("Unexpected exception:");
- e.printStackTrace();
- }
- }
- }
+ // Echoes a <file name="..."/> tag for a matched zipped-file entry.
+ // NOTE(review): start=13 and the trailing -3 strip a fixed-length
+ // prefix/suffix of the match — confirm against the lexer rule.
+ final void zippedFile() {
+ String name;
+ String matched = yytext();
+ int start = 13;
+ name = matched.substring(start, matched.length() - 3);
+ outputStream.println("<file name=\"" + name + "\"/>");
+ }
/**