blob: cb8f280d07cedba17c38f06d2b417cd702904634 [file] [log] [blame]
Marc Kupietz751868b2020-09-25 17:59:38 +02001package de.ids_mannheim.korap.tokenizer;
2
3import opennlp.tools.util.Span;
4
5import java.io.IOException;
6import java.io.PrintStream;
7import java.io.Reader;
8
9public interface KorapTokenizer extends opennlp.tools.tokenize.Tokenizer, opennlp.tools.sentdetect.SentenceDetector {
10 void scan() throws IOException;
11
12 /**
13 * Mainly targeted language(s)
14 * @return list of ISO 639 alpha-2 or alpha-3 language codes
15 * @apiNote will later be used to find appropriate implementations via reflection
16 */
17 CharSequence[] getTargetLanguages();
18
19 void setInputReader(Reader inputReader);
20
21 void setSplitSentences(boolean splitSentences);
22
23 void setEcho(boolean echo);
24
25 void setPrintOffsets(boolean printOffsets);
26
27 void setPrintTokens(boolean tokenize);
28
29 void setOutputStream(PrintStream outputStream);
30
31 void setNormalize(boolean normalize);
32
33 String[] tokenize(String s);
34
35 Span[] tokenizePos(String s);
36
37 String[] sentDetect(String s);
38
39 Span[] sentPosDetect(String s);
40
41 class Builder {
42 private boolean splitSentences;
43 private boolean echo;
44 private boolean printOffsets;
45 private boolean printTokens;
46 private PrintStream outputStream = System.out;
47 private boolean normalize;
48 private Class tokenizerClass;
49 private Reader inputReader;
50
51 public Builder tokenizerClassName(String tokenizerClassName) throws ClassNotFoundException {
52 this.tokenizerClass = Class.forName(tokenizerClassName);
53 return this;
54 }
55
56 public Builder splitSentences(boolean splitSentences) {
57 this.splitSentences = splitSentences;
58 return this;
59 }
60
61 public Builder setEcho(boolean echo) {
62 this.echo = echo;
63 return this;
64 }
65
66 public Builder printOffsets(boolean printOffsets) {
67 this.printOffsets = printOffsets;
68 return this;
69 }
70
71 public Builder printTokens(boolean printTokens) {
72 this.printTokens = printTokens;
73 return this;
74 }
75
76 public Builder inputReader(Reader inputReader) {
77 this.inputReader = inputReader;
78 return this;
79 }
80
81 public Builder normalize(boolean normalize) {
82 this.normalize = normalize;
83 return this;
84 }
85
86 public Builder outputStream(PrintStream outputStream) {
87 this.outputStream = outputStream;
88 return this;
89 }
90
91 public KorapTokenizer build() throws IllegalAccessException, InstantiationException {
92 KorapTokenizer korapTokenizer = (KorapTokenizer) tokenizerClass.newInstance();
93 korapTokenizer.setEcho(echo);
94 korapTokenizer.setInputReader(inputReader);
95 korapTokenizer.setOutputStream(outputStream);
96 korapTokenizer.setNormalize(normalize);
97 korapTokenizer.setPrintOffsets(printOffsets);
98 korapTokenizer.setSplitSentences(splitSentences);
99 korapTokenizer.setPrintTokens(printTokens);
100 return korapTokenizer;
101 }
102 }
103}