package de.ids_mannheim.korap.tokenizer;

import opennlp.tools.util.Span;

import java.io.IOException;
import java.io.PrintStream;
import java.io.Reader;
import java.lang.reflect.InvocationTargetException;

| 9 | public interface KorapTokenizer extends opennlp.tools.tokenize.Tokenizer, opennlp.tools.sentdetect.SentenceDetector { |
| 10 | void scan() throws IOException; |
| 11 | |
| 12 | /** |
| 13 | * Mainly targeted language(s) |
| 14 | * @return list of ISO 639 alpha-2 or alpha-3 language codes |
| 15 | * @apiNote will later be used to find appropriate implementations via reflection |
| 16 | */ |
| 17 | CharSequence[] getTargetLanguages(); |
| 18 | |
| 19 | void setInputReader(Reader inputReader); |
| 20 | |
| 21 | void setSplitSentences(boolean splitSentences); |
| 22 | |
| 23 | void setEcho(boolean echo); |
| 24 | |
| 25 | void setPrintOffsets(boolean printOffsets); |
| 26 | |
| 27 | void setPrintTokens(boolean tokenize); |
| 28 | |
| 29 | void setOutputStream(PrintStream outputStream); |
| 30 | |
| 31 | void setNormalize(boolean normalize); |
| 32 | |
| 33 | String[] tokenize(String s); |
| 34 | |
| 35 | Span[] tokenizePos(String s); |
| 36 | |
| 37 | String[] sentDetect(String s); |
| 38 | |
| 39 | Span[] sentPosDetect(String s); |
| 40 | |
| 41 | class Builder { |
| 42 | private boolean splitSentences; |
| 43 | private boolean echo; |
| 44 | private boolean printOffsets; |
| 45 | private boolean printTokens; |
| 46 | private PrintStream outputStream = System.out; |
| 47 | private boolean normalize; |
| 48 | private Class tokenizerClass; |
| 49 | private Reader inputReader; |
| 50 | |
| 51 | public Builder tokenizerClassName(String tokenizerClassName) throws ClassNotFoundException { |
| 52 | this.tokenizerClass = Class.forName(tokenizerClassName); |
| 53 | return this; |
| 54 | } |
| 55 | |
| 56 | public Builder splitSentences(boolean splitSentences) { |
| 57 | this.splitSentences = splitSentences; |
| 58 | return this; |
| 59 | } |
| 60 | |
| 61 | public Builder setEcho(boolean echo) { |
| 62 | this.echo = echo; |
| 63 | return this; |
| 64 | } |
| 65 | |
| 66 | public Builder printOffsets(boolean printOffsets) { |
| 67 | this.printOffsets = printOffsets; |
| 68 | return this; |
| 69 | } |
| 70 | |
| 71 | public Builder printTokens(boolean printTokens) { |
| 72 | this.printTokens = printTokens; |
| 73 | return this; |
| 74 | } |
| 75 | |
| 76 | public Builder inputReader(Reader inputReader) { |
| 77 | this.inputReader = inputReader; |
| 78 | return this; |
| 79 | } |
| 80 | |
| 81 | public Builder normalize(boolean normalize) { |
| 82 | this.normalize = normalize; |
| 83 | return this; |
| 84 | } |
| 85 | |
| 86 | public Builder outputStream(PrintStream outputStream) { |
| 87 | this.outputStream = outputStream; |
| 88 | return this; |
| 89 | } |
| 90 | |
| 91 | public KorapTokenizer build() throws IllegalAccessException, InstantiationException { |
| 92 | KorapTokenizer korapTokenizer = (KorapTokenizer) tokenizerClass.newInstance(); |
| 93 | korapTokenizer.setEcho(echo); |
| 94 | korapTokenizer.setInputReader(inputReader); |
| 95 | korapTokenizer.setOutputStream(outputStream); |
| 96 | korapTokenizer.setNormalize(normalize); |
| 97 | korapTokenizer.setPrintOffsets(printOffsets); |
| 98 | korapTokenizer.setSplitSentences(splitSentences); |
| 99 | korapTokenizer.setPrintTokens(printTokens); |
| 100 | return korapTokenizer; |
| 101 | } |
| 102 | } |
| 103 | } |