// Source blob: 475a84361f76523a3fc51538d047f87f08805933
package de.ids_mannheim.korap.tokenizer;
import io.github.classgraph.*;
import picocli.CommandLine;
import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.stream.Collectors;
/**
 * Command line front end of the KorAP tokenizer.
 *
 * <p>Options are parsed by picocli; {@link #call()} then runs the selected
 * tokenizer implementation over every input file (or standard input when no
 * file is given) and writes the result to standard output or to the file
 * named by {@code --output-file}.</p>
 */
@CommandLine.Command(mixinStandardHelpOptions = true,
        name = "koraptokenizer", version = "2.2.3", description = "Tokenizes (and sentence splits) text input.")
public class Main implements Callable<Integer> {

    /** Language code that is active when no {@code --language} option is given. */
    public final String DEFAULT_LANGUAGE = "de";

    /** Tokenizer class that is used when neither {@code -T} nor {@code -l} is given. */
    public final String DEFAULT_TOKENIZER_CLASS_NAME = DerekoDfaTokenizer_de.class.getName();

    /** Fully qualified name of the interface by which tokenizer implementations are discovered. */
    private static final String KORAP_TOKENIZER_INTERFACE = "de.ids_mannheim.korap.tokenizer.KorapTokenizer";

    /** Injected by picocli; needed to build {@link CommandLine.ParameterException}s. */
    @CommandLine.Spec
    CommandLine.Model.CommandSpec spec;

    /**
     * Runs the ClassGraph scan that all discovery methods in this class rely on.
     *
     * @return the scan result; the caller is responsible for closing it
     */
    private static ScanResult scanClasspath() {
        return new ClassGraph().enableAllInfo().acceptPackages("*").scan();
    }

    /**
     * Looks up a tokenizer implementation that declares the given language in
     * its {@link Languages} annotation.
     *
     * @param languageTwoLetterCode language code to look for
     * @return class name of the first matching implementation found by the
     *         scan, or {@code null} if no implementation declares the language
     */
    public static String getTokenizerForLanguage(String languageTwoLetterCode) {
        try (ScanResult scanResult = scanClasspath()) {
            ClassInfoList korapTokenizerClasses = scanResult.getClassesImplementing(KORAP_TOKENIZER_INTERFACE);
            for (String n : korapTokenizerClasses.getNames()) {
                AnnotationInfo v = scanResult.getClassInfo(n).getAnnotationInfo(Languages.class.getName());
                if (v == null) {
                    // Implementation without a @Languages annotation: cannot be selected by language.
                    continue;
                }
                for (AnnotationParameterValue i : v.getParameterValues()) {
                    for (String lang : (String[]) i.getValue()) {
                        if (lang.equals(languageTwoLetterCode)) {
                            return n;
                        }
                    }
                }
            }
        }
        return null;
    }

    /**
     * Completion candidates for {@code --language}: all language codes declared
     * by the tokenizer implementations found on the classpath.
     */
    static class AvailableLanguagesList extends ArrayList<String> {
        AvailableLanguagesList() {
            super(listKorAPTokenizerLanguages());
        }

        /**
         * Collects the values of the {@link Languages} annotations of all
         * tokenizer implementations.
         *
         * @return the language codes, sorted and without duplicates
         */
        static List<String> listKorAPTokenizerLanguages() {
            ArrayList<String> languages = new ArrayList<>();
            try (ScanResult scanResult = scanClasspath()) {
                ClassInfoList korapTokenizerClasses = scanResult.getClassesImplementing(KORAP_TOKENIZER_INTERFACE);
                for (String n : korapTokenizerClasses.getNames()) {
                    AnnotationInfo v = scanResult.getClassInfo(n).getAnnotationInfo(Languages.class.getName());
                    if (v == null) {
                        continue;
                    }
                    for (AnnotationParameterValue i : v.getParameterValues()) {
                        languages.addAll(Arrays.asList((String[]) i.getValue()));
                    }
                }
            }
            return languages.stream().sorted().distinct().collect(Collectors.toList());
        }
    }

    /**
     * Completion candidates for {@code --tokenizer-class}: the names of all
     * tokenizer implementations found on the classpath.
     */
    static class AvailableKorapTokenizerList extends ArrayList<String> {
        AvailableKorapTokenizerList() {
            super(listKorAPTokenizerImplementations());
        }

        /**
         * @return class names of all classes implementing the tokenizer interface
         */
        static List<String> listKorAPTokenizerImplementations() {
            try (ScanResult scanResult = scanClasspath()) {
                return scanResult.getClassesImplementing(KORAP_TOKENIZER_INTERFACE).getNames();
            }
        }
    }

    /** Class name handed to the tokenizer builder; also overwritten by {@link #setLanguage(String)}. */
    @CommandLine.Option(names = {"-T", "--tokenizer-class"},
            completionCandidates = AvailableKorapTokenizerList.class,
            description = "Class name of the actual tokenizer that will be used (candidates: ${COMPLETION-CANDIDATES} default: ${DEFAULT-VALUE})")
    String tokenizerClassName = DEFAULT_TOKENIZER_CLASS_NAME;

    /** Last language accepted by {@link #setLanguage(String)}. */
    String language = DEFAULT_LANGUAGE;

    /**
     * Option setter for {@code --language}: selects the tokenizer class that
     * declares the requested language.
     *
     * <p>State is only modified once the language has been resolved, so a
     * rejected value leaves {@link #tokenizerClassName} and {@link #language}
     * untouched.</p>
     *
     * @param requestedLanguage language code given on the command line
     * @throws CommandLine.ParameterException if no tokenizer implementation
     *         declares {@code requestedLanguage}
     */
    @CommandLine.Option(names = {"-l", "--language"},
            completionCandidates = AvailableLanguagesList.class,
            description = "ISO-639-1 two letter language code (valid candidates: ${COMPLETION-CANDIDATES}; default: " + DEFAULT_LANGUAGE + ")")
    public void setLanguage(String requestedLanguage) {
        String resolvedClassName = getTokenizerForLanguage(requestedLanguage);
        if (resolvedClassName == null) {
            // Report the value the user actually passed, not the previously active language.
            throw new CommandLine.ParameterException(spec.commandLine(),
                    String.format("Invalid value '%s' for option '--language': " +
                                    "(use one of: %s).", requestedLanguage,
                            AvailableLanguagesList.listKorAPTokenizerLanguages()));
        }
        tokenizerClassName = resolvedClassName;
        language = requestedLanguage;
    }

    /** Passed to the builder's {@code printTokens}. */
    @CommandLine.Option(names = {"--no-tokens"}, negatable = true, description = "Print tokens (default: ${DEFAULT-VALUE})")
    boolean tokens = true;

    /** Passed to the builder's {@code printOffsets}. */
    @CommandLine.Option(names = {"-p", "--positions"}, description = "Print token start and end positions as character offsets (default: ${DEFAULT-VALUE})")
    boolean positions = false;

    /** Passed to the builder's {@code splitSentences}. */
    @CommandLine.Option(names = {"-s", "--sentence-boundaries"}, description = "Print sentence boundary positions (default: ${DEFAULT-VALUE})")
    boolean sentencize = false;

    /** Hidden, deprecated switch; not read anywhere in this class. */
    @CommandLine.Option(names = {"-ktt"}, hidden = true, description = "Deprecated. For internal use only. (default: ${DEFAULT-VALUE})")
    boolean ktt = false;

    /** Passed to the builder's {@code normalize}. */
    @CommandLine.Option(names = {"-n", "--normalize"}, description = "Normalize tokens (default: ${DEFAULT-VALUE})")
    boolean normalize = false;

    /** Output file name; {@code "-"} (or {@code null}) means standard output. */
    @SuppressWarnings("CanBeFinal")
    @CommandLine.Option(names = {"-o",
            "--output-file"}, paramLabel = "FILE", description = "Output file (default: ${DEFAULT-VALUE})")
    String output_filename = "-";

    /** Character set used to decode every input (files and standard input). */
    @SuppressWarnings("CanBeFinal")
    @CommandLine.Option(names = {"-e",
            "--encoding"}, description = "Input encoding (default: ${DEFAULT-VALUE})")
    Charset encoding = StandardCharsets.UTF_8;

    /** Forwarded to {@code Utils.createFile} when the output file is created. */
    @SuppressWarnings("CanBeFinal")
    @CommandLine.Option(names = {"--force"}, description = "Force overwrite (default: ${DEFAULT-VALUE})")
    boolean force_overwrite = false;

    /** Input file names; an empty list means "read standard input". */
    @CommandLine.Parameters(arity = "0..*", paramLabel = "FILES", description = "input files")
    private final ArrayList<String> inputFiles = new ArrayList<>();

    public Main() {
    }

    /**
     * Program entry point: parses {@code args} and runs {@link #call()}.
     *
     * <p>The value returned by {@code execute} is intentionally not turned
     * into a {@code System.exit} call here, so invoking this method from
     * within a running JVM does not terminate it.</p>
     *
     * @param args command line arguments
     */
    public static void main(String[] args) {
        new CommandLine(new Main()).execute(args);
    }

    /**
     * Tokenizes all inputs in order and writes the result to the selected output.
     *
     * <p>A failure on one input is reported on standard error and does not
     * stop processing of the remaining inputs. Readers opened on input files
     * are closed after each file; standard input is left open. An output file
     * is closed even if processing ends abnormally.</p>
     *
     * @return always {@code 0}
     * @throws FileNotFoundException if the output file cannot be opened for writing
     */
    @Override
    public Integer call() throws FileNotFoundException {
        final boolean writeToStdout = (output_filename == null) || output_filename.equals("-");
        final PrintStream output_stream;
        if (writeToStdout) {
            output_stream = System.out;
        } else {
            File f = Utils.createFile(output_filename, force_overwrite);
            output_stream = new PrintStream(new BufferedOutputStream(new FileOutputStream(f)));
        }

        // No file arguments: process standard input exactly once.
        final List<String> sources = inputFiles.isEmpty() ? Arrays.asList("-") : inputFiles;
        try {
            for (String fn : sources) {
                final boolean fromStdin = "-".equals(fn);
                // Only file readers are managed as resources; a null resource is skipped by
                // try-with-resources, which keeps System.in open for the stdin case.
                try (BufferedReader fileReader = fromStdin ? null
                        : Files.newBufferedReader(new File(fn).toPath(), encoding)) {
                    BufferedReader br = fromStdin
                            ? new BufferedReader(new InputStreamReader(System.in, encoding))
                            : fileReader;
                    new KorapTokenizer.Builder()
                            .tokenizerClassName(tokenizerClassName)
                            .inputReader(br)
                            .outputStream(output_stream)
                            .printTokens(tokens)
                            .printOffsets(positions)
                            .normalize(normalize)
                            .splitSentences(sentencize)
                            .setEcho(true)
                            .build()
                            .scan();
                } catch (FileNotFoundException | java.nio.file.NoSuchFileException e) {
                    // Files.newBufferedReader signals a missing file with NoSuchFileException.
                    System.err.println("File not found : \"" + fn + "\"");
                } catch (IOException e) {
                    System.err.println("IO error scanning file \"" + fn + "\"");
                    System.err.println(e);
                } catch (Exception e) {
                    System.err.println("Unexpected exception:");
                    e.printStackTrace();
                }
            }
        } finally {
            if (!writeToStdout) {
                output_stream.close();
            }
        }
        return 0;
    }
}