Add -l command line option to choose language
Change-Id: I48e1ce119dfbd75ff9f719ce8525a2c63ad47fc5
diff --git a/src/main/java/de/ids_mannheim/korap/tokenizer/Languages.java b/src/main/java/de/ids_mannheim/korap/tokenizer/Languages.java
new file mode 100644
index 0000000..623edc8
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/tokenizer/Languages.java
@@ -0,0 +1,5 @@
+package de.ids_mannheim.korap.tokenizer;
+
+/**
+ * Marks a tokenizer class with the language codes it can handle.
+ *
+ * <p>{@code Main} inspects the classes implementing {@code KorapTokenizer} for
+ * this annotation, both to map a requested language code to a tokenizer class
+ * and to list the available languages.
+ *
+ * <p>Retention is {@code RUNTIME} so the annotation is also visible through
+ * reflection ({@code Class#getAnnotation}); with the default {@code CLASS}
+ * retention only class-file scanners could see it.
+ */
+@java.lang.annotation.Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
+public @interface Languages {
+    /**
+     * @return the supported language codes, e.g. {@code "de"}
+     */
+    String[] value();
+}
diff --git a/src/main/java/de/ids_mannheim/korap/tokenizer/Main.java b/src/main/java/de/ids_mannheim/korap/tokenizer/Main.java
index 26f48f7..7509b46 100644
--- a/src/main/java/de/ids_mannheim/korap/tokenizer/Main.java
+++ b/src/main/java/de/ids_mannheim/korap/tokenizer/Main.java
@@ -1,8 +1,6 @@
package de.ids_mannheim.korap.tokenizer;
-import io.github.classgraph.ClassGraph;
-import io.github.classgraph.ClassInfoList;
-import io.github.classgraph.ScanResult;
+import io.github.classgraph.*;
import picocli.CommandLine;
import java.io.*;
@@ -18,6 +16,55 @@
name = "koraptokenizer", version = "{}", description = "Tokenizes (and sentence splits) text input.")
public class Main implements Callable<Integer> {
+    /** Language code used for {@code language} when no {@code -l} option is given. */
+    public static final String DEFAULT_LANGUAGE = "de";
+
+    /** Fully qualified class name of the tokenizer used when none is selected. */
+    public static final String DEFAULT_TOKENIZER_CLASS_NAME = DerekoDfaTokenizer_de.class.getName();
+
+    /** Command model annotated for picocli injection; used when reporting parameter errors. */
+    @CommandLine.Spec
+    CommandLine.Model.CommandSpec spec;
+
+    /**
+     * Finds a tokenizer implementation annotated for the given language.
+     *
+     * <p>Scans for classes implementing {@code KorapTokenizer} and returns the
+     * first one whose {@link Languages} annotation lists the requested code.
+     *
+     * @param languageTwoLetterCode language code to look up; compared verbatim
+     * @return the matching tokenizer's class name, or {@code null} if no
+     *         annotated tokenizer lists the code
+     */
+    public static String getTokenizerForLanguage(String languageTwoLetterCode) {
+        final String languagesAnnotation = Languages.class.getName();
+        try (ScanResult scan = new ClassGraph().enableAllInfo().acceptPackages("*").scan()) {
+            ClassInfoList tokenizers =
+                    scan.getClassesImplementing("de.ids_mannheim.korap.tokenizer.KorapTokenizer");
+            for (ClassInfo tokenizer : tokenizers) {
+                AnnotationInfo annotation = tokenizer.getAnnotationInfo(languagesAnnotation);
+                if (annotation == null) {
+                    // Tokenizer without a @Languages declaration: nothing to match.
+                    continue;
+                }
+                for (AnnotationParameterValue parameter : annotation.getParameterValues()) {
+                    for (String supported : (String[]) parameter.getValue()) {
+                        if (supported.equals(languageTwoLetterCode)) {
+                            return tokenizer.getName();
+                        }
+                    }
+                }
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Completion candidates for the {@code --language} option: the language
+     * codes declared by the annotated tokenizer classes.
+     */
+    static class AvailableLanguagesList extends ArrayList<String> {
+        AvailableLanguagesList() {
+            super(listKorAPTokenizerLanguages());
+        }
+
+        /**
+         * Collects the codes from the {@link Languages} annotations of all
+         * classes implementing {@code KorapTokenizer}.
+         *
+         * @return the distinct language codes in sorted order
+         */
+        static List<String> listKorAPTokenizerLanguages() {
+            final String languagesAnnotation = Languages.class.getName();
+            List<String> found = new ArrayList<>();
+            try (ScanResult scan = new ClassGraph().enableAllInfo().acceptPackages("*").scan()) {
+                ClassInfoList tokenizers =
+                        scan.getClassesImplementing("de.ids_mannheim.korap.tokenizer.KorapTokenizer");
+                for (ClassInfo tokenizer : tokenizers) {
+                    AnnotationInfo annotation = tokenizer.getAnnotationInfo(languagesAnnotation);
+                    if (annotation == null) {
+                        continue;
+                    }
+                    for (AnnotationParameterValue parameter : annotation.getParameterValues()) {
+                        for (String code : (String[]) parameter.getValue()) {
+                            found.add(code);
+                        }
+                    }
+                }
+            }
+            return found.stream().distinct().sorted().collect(Collectors.toList());
+        }
+    }
+
static class AvailableKorapTokenizerList extends ArrayList<String> {
AvailableKorapTokenizerList() {
super(listKorAPTokenizerImplementations());
@@ -37,7 +84,23 @@
@CommandLine.Option(names = {"-T", "--tokenizer-class"},
completionCandidates= AvailableKorapTokenizerList.class,
description = "Class name of the actual tokenizer that will be used (candidates: ${COMPLETION-CANDIDATES} default: ${DEFAULT-VALUE})")
- String tokenizerClassName = DerekoDfaTokenizer_de.class.getName();
+ String tokenizerClassName = DEFAULT_TOKENIZER_CLASS_NAME;
+
+
+ String language = DEFAULT_LANGUAGE;
+ @CommandLine.Option(names = {"-l", "--language"},
+ completionCandidates = AvailableLanguagesList.class,
+ description = "ISO-639-1 two letter language code (valid candidates: ${COMPLETION-CANDIDATES}; default: " + DEFAULT_LANGUAGE + ")")
+ public void setLanguage(String requestedLanguage) {
+ tokenizerClassName = getTokenizerForLanguage(requestedLanguage);
+ if (tokenizerClassName == null) {
+ throw new CommandLine.ParameterException(spec.commandLine(),
+ String.format("Invalid value '%s' for option '--language': " +
+ "(use one of: %s).", language,
+ AvailableLanguagesList.listKorAPTokenizerLanguages()));
+ }
+ language = requestedLanguage;
+ }
@CommandLine.Option(names = {"--no-tokens"}, negatable = true, description = "Print tokens (default: ${DEFAULT-VALUE})")
boolean tokens = true;
@@ -122,4 +185,3 @@
return 0;
}
}
-
diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
index 042080d..939260c 100644
--- a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
+++ b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
@@ -54,6 +54,8 @@
import java.util.ArrayList;
import java.util.List;
import opennlp.tools.util.Span;
+
+@Languages({ /*$"\""+target.language+"\" })$*/ /*-*/ ""})
%%
/**
@@ -70,7 +72,7 @@
%char
%{
- private static final CharSequence[] targetLanguages = { /*$"\""+target.language+"\"};$*/ /*-*/ "" };
+ private static final CharSequence[] targetLanguages = { /*$"\""+target.language+"\"};$*/ /*-*/ "" };
private boolean xmlEcho = false;
private boolean normalize = false;
private boolean debug = false;
@@ -546,7 +548,7 @@
// pragmas used for anonymization etc.
PRAGMA = \[_[A-Z\-]+_\]
-%include SEABBR_/*$target.language$*/.jflex-macro
+%include language-specific_/*$target.language$*/.jflex-macro
%s OPEN_QUOTE POLISH_CONDITIONAL_MODE JUST_AFTER_PERIOD CLITIC_MODE
diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/SEABBR_de.jflex-macro b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_de.jflex-macro
similarity index 100%
rename from src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/SEABBR_de.jflex-macro
rename to src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_de.jflex-macro
diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/SEABBR_en.jflex-macro b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_en.jflex-macro
similarity index 100%
rename from src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/SEABBR_en.jflex-macro
rename to src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_en.jflex-macro