List available KorapTokenizerImplementation options for -T
Change-Id: I8e45410f67decb6b44fc9574dfe94c89b7abd0cc
diff --git a/pom.xml b/pom.xml
index b2be2d2..104c1e7 100644
--- a/pom.xml
+++ b/pom.xml
@@ -100,7 +100,7 @@
<path>
<groupId>info.picocli</groupId>
<artifactId>picocli-codegen</artifactId>
- <version>4.2.0</version>
+ <version>4.5.0</version>
</path>
</annotationProcessorPaths>
<compilerArgs>
@@ -243,7 +243,7 @@
<dependency>
<groupId>info.picocli</groupId>
<artifactId>picocli</artifactId>
- <version>4.2.0</version>
+ <version>4.5.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.opennlp/opennlp-tools -->
@@ -286,5 +286,10 @@
<artifactId>jcp</artifactId>
<version>7.0.4</version>
</dependency>
+ <dependency>
+ <groupId>io.github.classgraph</groupId>
+ <artifactId>classgraph</artifactId>
+ <version>4.8.90</version>
+ </dependency>
</dependencies>
</project>
diff --git a/src/assembly/bin-distribution.xml b/src/assembly/bin-distribution.xml
index d213453..8e8ed99 100644
--- a/src/assembly/bin-distribution.xml
+++ b/src/assembly/bin-distribution.xml
@@ -18,6 +18,8 @@
<include>opennlp/tools/tokenize/Tokenizer.class</include>
<include>opennlp/tools/sentdetect/SentenceDetector.class</include>
<include>picocli/CommandLine*.class</include>
+ <include>io/github/classgraph/*.class</include>
+ <include>nonapi/io/github/classgraph/**/*.class</include>
</includes>
</unpackOptions>
<scope>runtime</scope>
diff --git a/src/main/java/de/ids_mannheim/korap/tokenizer/Main.java b/src/main/java/de/ids_mannheim/korap/tokenizer/Main.java
index 82c6884..26f48f7 100644
--- a/src/main/java/de/ids_mannheim/korap/tokenizer/Main.java
+++ b/src/main/java/de/ids_mannheim/korap/tokenizer/Main.java
@@ -1,5 +1,8 @@
package de.ids_mannheim.korap.tokenizer;
+import io.github.classgraph.ClassGraph;
+import io.github.classgraph.ClassInfoList;
+import io.github.classgraph.ScanResult;
import picocli.CommandLine;
import java.io.*;
@@ -7,13 +10,33 @@
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.ArrayList;
+import java.util.List;
import java.util.concurrent.Callable;
+import java.util.stream.Collectors;
@CommandLine.Command(mixinStandardHelpOptions = true,
name = "koraptokenizer", version = "{}", description = "Tokenizes (and sentence splits) text input.")
public class Main implements Callable<Integer> {
- @CommandLine.Option(names = {"-T", "--tokenizer-class"}, description = "Class name of the actual tokenizer that will be used (default: ${DEFAULT-VALUE})")
+ static class AvailableKorapTokenizerList extends ArrayList<String> {
+ // Completion candidates for -T: pre-filled with the class names returned by the scan below.
+ AvailableKorapTokenizerList() {
+ super(listKorAPTokenizerImplementations());
+ }
+
+ // Scans all packages with ClassGraph and returns the names of the classes implementing KorapTokenizer.
+ static List<String> listKorAPTokenizerImplementations() {
+ final ClassGraph scanner = new ClassGraph().enableAllInfo().acceptPackages("*");
+ try (ScanResult scan = scanner.scan()) {
+ ClassInfoList implementations = scan.getClassesImplementing("de.ids_mannheim.korap.tokenizer.KorapTokenizer");
+ return implementations.getNames();
+ }
+ }
+ }
+
+ @CommandLine.Option(names = {"-T", "--tokenizer-class"},
+ completionCandidates= AvailableKorapTokenizerList.class,
+ description = "Class name of the actual tokenizer that will be used (candidates: ${COMPLETION-CANDIDATES} default: ${DEFAULT-VALUE})")
String tokenizerClassName = DerekoDfaTokenizer_de.class.getName();
@CommandLine.Option(names = {"--no-tokens"}, negatable = true, description = "Print tokens (default: ${DEFAULT-VALUE})")